From: Yen Chi Hsuan Date: Sun, 29 Nov 2015 05:08:46 +0000 (+0800) Subject: Merge pull request #7691 from ryandesign/use-PYTHON-env-var X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=ef428960c9b3972586977446e82ec3872094cc1e;hp=8639f89f516c5bd1e4fda38c40e2a5a9b940ad85 Merge pull request #7691 from ryandesign/use-PYTHON-env-var Always use PYTHON env var in Makefile --- diff --git a/.travis.yml b/.travis.yml index fb34299fc..cc21fae8f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,11 +2,11 @@ language: python python: - "2.6" - "2.7" + - "3.2" - "3.3" - "3.4" -before_install: - - sudo apt-get update -qq - - sudo apt-get install -yqq rtmpdump + - "3.5" +sudo: false script: nosetests test --verbose notifications: email: diff --git a/AUTHORS b/AUTHORS index 3d6985ab6..cdb56de3b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -110,3 +110,40 @@ Shaya Goldberg Paul Hartmann Frans de Jonge Robin de Rooij +Ryan Schmidt +Leslie P. Polzer +Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver +Will W. +Mohammad Teimori Pabandi +Roman Le Négrate +Matthias Küch +Julian Richen +Ping O. +Mister Hat +Peter Ding +jackyzy823 +George Brighton +Remita Amine +Aurélio A. Heckert +Bernhard Minks +sceext +Zach Bruggeman +Tjark Saul +slangangular +Behrouz Abbasi +ngld +nyuszika7h +Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský +Qijiang Fan +Rémy Léone +Marco Ferragina +reiv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 351229f21..09ce98ca2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ **Please include the full output of youtube-dl when run with `-v`**. -The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +The output (including the first lines) contains important debugging information. 
Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): @@ -16,13 +16,15 @@ So please elaborate on what feature you are requesting, or what bug you want to If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. 
If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). + +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? @@ -112,18 +114,19 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title') + title = self._html_search_regex(r'

<h1>(.+?)</h1>

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. 
If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/Makefile b/Makefile index 337a2eefb..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,8 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + find . 
-name "*.pyc" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@ -43,7 +44,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz diff --git a/README.md b/README.md index 731cea1e1..df419abe8 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,11 @@ youtube-dl - download videos from youtube.com or other video platforms - [OPTIONS](#options) - [CONFIGURATION](#configuration) - [OUTPUT TEMPLATE](#output-template) +- [FORMAT SELECTION](#format-selection) - [VIDEO SELECTION](#video-selection) - [FAQ](#faq) - [DEVELOPER INSTRUCTIONS](#developer-instructions) +- [EMBEDDING YOUTUBE-DL](#embedding-youtube-dl) - [BUGS](#bugs) - [COPYRIGHT](#copyright) @@ -16,12 +18,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). @@ -33,7 +35,7 @@ You can also use pip: sudo pip install youtube-dl -Alternatively, refer to the developer instructions below for how to check out and work with the git repository. 
For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . +Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . # DESCRIPTION **youtube-dl** is a small command-line program to download videos from @@ -45,24 +47,25 @@ which means you can modify it, redistribute it or use it however you like. youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help print this help text and exit - --version print program version and exit - -U, --update update this program to latest version. Make + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors continue on download errors, for example to + -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs - --dump-user-agent display the current browser identification - --list-extractors List all supported extractors and the URLs - they would handle + --dump-user-agent Display the current browser identification + --list-extractors List all supported extractors --extractor-descriptions Output descriptions of all supported extractors + --force-generic-extractor Force extraction to use the generic + extractor --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large + from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). 
"error" just throws @@ -77,7 +80,7 @@ which means you can modify it, redistribute it or use it however you like. on Windows) --flat-playlist Do not extract the videos of a playlist, only list them. - --no-color Do not emit color codes in output. + --no-color Do not emit color codes in output ## Network Options: --proxy URL Use the specified HTTP/HTTPS proxy. Pass in @@ -90,37 +93,42 @@ which means you can modify it, redistribute it or use it however you like. (experimental) -6, --force-ipv6 Make all connections via IPv6 (experimental) + --cn-verification-proxy URL Use this proxy to verify the IP address for + some Chinese sites. The default proxy + specified by --proxy (or none, if the + options is not present) is used for the + actual downloading. (experimental) ## Video Selection: - --playlist-start NUMBER playlist video to start at (default is 1) - --playlist-end NUMBER playlist video to end at (default is last) - --playlist-items ITEM_SPEC playlist video items to download. Specify + --playlist-start NUMBER Playlist video to start at (default is 1) + --playlist-end NUMBER Playlist video to end at (default is last) + --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist - seperated by commas like: "--playlist-items + separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX download only matching titles (regex or + --match-title REGEX Download only matching titles (regex or caseless sub-string) - --reject-title REGEX skip download for matching titles (regex or + --reject-title REGEX Skip download for matching titles (regex or caseless sub-string) --max-downloads NUMBER Abort after downloading NUMBER files --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 
50k or 44.6m) --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) - --date DATE download only videos uploaded in this date - --datebefore DATE download only videos uploaded on or before + --date DATE Download only videos uploaded in this date + --datebefore DATE Download only videos uploaded on or before this date (i.e. inclusive) - --dateafter DATE download only videos uploaded on or after + --dateafter DATE Download only videos uploaded on or after this date (i.e. inclusive) --min-views COUNT Do not download any videos with less than COUNT views --max-views COUNT Do not download any videos with more than COUNT views - --match-filter FILTER (Experimental) Generic video filter. + --match-filter FILTER Generic video filter (experimental). Specify any key (see help for -o for a list of available keys) to match if the key is present, !key to check if the key is not @@ -135,11 +143,13 @@ which means you can modify it, redistribute it or use it however you like. less than 50 times (or the dislike functionality is not available at the given service), but who also have a description, - use --match-filter "like_count > 100 & + use --match-filter "like_count > 100 & dislike_count 10M]"). This works for - filesize, height, width, tbr, abr, vbr, - asr, and fps and the comparisons <, <=, >, - >=, =, != and for ext, acodec, vcodec, - container, and protocol and the comparisons - =, != . Formats for which the value is not - known are excluded unless you put a - question mark (?) after the operator. You - can combine format filters, so -f "[height - <=? 720][tbr>500]" selects up to 720p - videos (or videos where the height is not - known) with a bitrate of at least 500 - KBit/s. By default, youtube-dl will pick - the best quality. Use commas to download - multiple audio formats, such as -f - 136/137/mp4/bestvideo,140/m4a/bestaudio. 
- You can merge the video and audio of two - formats into a single file using -f + (requires ffmpeg or - avconv), for example -f - bestvideo+bestaudio. - --all-formats download all available video formats - --prefer-free-formats prefer free video formats unless a specific + -f, --format FORMAT Video format code, see the "FORMAT + SELECTION" for all the info + --all-formats Download all available video formats + --prefer-free-formats Prefer free video formats unless a specific one is requested - --max-quality FORMAT highest quality format to download - -F, --list-formats list all available formats - --youtube-skip-dash-manifest Do not download the DASH manifest on - YouTube videos + -F, --list-formats List all available formats of specified + videos + --youtube-skip-dash-manifest Do not download the DASH manifests and + related data on YouTube videos --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, - webm, flv.Ignored if no merge is required + webm, flv. 
Ignored if no merge is required ## Subtitle Options: - --write-sub write subtitle file - --write-auto-sub write automatic subtitle file (youtube - only) - --all-subs downloads all the available subtitles of - the video - --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] - youtube only) - --sub-lang LANGS languages of the subtitles to download + --write-sub Write subtitle file + --write-auto-sub Write automatically generated subtitle file + (YouTube only) + --all-subs Download all the available subtitles of the + video + --list-subs List all available subtitles for the video + --sub-format FORMAT Subtitle format, accepts formats + preference, for example: "srt" or + "ass/srt/best" + --sub-lang LANGS Languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' ## Authentication Options: - -u, --username USERNAME login with this account ID - -p, --password PASSWORD account password. If this option is left + -u, --username USERNAME Login with this account ID + -p, --password PASSWORD Account password. If this option is left out, youtube-dl will ask interactively. 
- -2, --twofactor TWOFACTOR two-factor auth code - -n, --netrc use .netrc authentication data - --video-password PASSWORD video password (vimeo, smotri) + -2, --twofactor TWOFACTOR Two-factor auth code + -n, --netrc Use .netrc authentication data + --video-password PASSWORD Video password (vimeo, smotri, youku) ## Post-processing Options: - -x, --extract-audio convert video files to audio-only files + -x, --extract-audio Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) - --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", - "opus", or "wav"; "best" by default - --audio-quality QUALITY ffmpeg/avconv audio quality specification, - insert a value between 0 (better) and 9 - (worse) for VBR or a specific bitrate like - 128K (default 5) + --audio-format FORMAT Specify audio format: "best", "aac", + "vorbis", "mp3", "m4a", "opus", or "wav"; + "best" by default + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert + a value between 0 (better) and 9 (worse) + for VBR or a specific bitrate like 128K + (default 5) --recode-video FORMAT Encode the video to another format if necessary (currently supported: - mp4|flv|ogg|webm|mkv) - -k, --keep-video keeps the video file on disk after the - post-processing; the video is erased by - default - --no-post-overwrites do not overwrite post-processed files; the + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the postprocessor + -k, --keep-video Keep the video file on disk after the post- + processing; the video is erased by default + --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs embed subtitles in the video (only for mp4 - videos) - --embed-thumbnail embed thumbnail in the audio as cover art - --add-metadata write metadata to the video file - --xattrs write metadata to the video file's xattrs + --embed-subs Embed subtitles in the video (only for mkv + and mp4 
videos) + --embed-thumbnail Embed thumbnail in the audio as cover art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song title / + artist from the video title. The format + syntax is the same as --output, the parsed + parameters replace existing values. + Additional templates: %(album)s, + %(artist)s. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a title + like "Coldplay - Paradise" + --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only - emit a warning), detect_or_warn(the + emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. --exec CMD Execute a command on the file after downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm {}' + --convert-subtitles FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt) # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. 
On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: +``` +--extract-audio +--no-mtime +--proxy 127.0.0.1:3128 +``` + +You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. + +### Authentication with `.netrc` file ### + +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +``` +touch $HOME/.netrc +chmod a-rwx,u+rw $HOME/.netrc +``` +After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: +``` +machine <extractor> login <login> password <password> +``` +For example: +``` +machine youtube login myaccount@gmail.com password my_youtube_password +machine twitch login my_twitch_account_name password my_twitch_password +``` +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). + +On Windows you may also need to set up the `%HOME%` environment variable manually. # OUTPUT TEMPLATE -The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. 
However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are: +The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are: - `id`: The sequence will be replaced by the video identifier. - `url`: The sequence will be replaced by the video URL. @@ -418,8 +446,10 @@ The `-o` option allows users to indicate a template for the output file names. T - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). - `epoch`: The sequence will be replaced by the Unix epoch when creating the file. - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - - `playlist`: The name or the id of the playlist that contains the video. - - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `playlist`: The sequence will be replaced by the name or the id of the playlist that contains the video. + - `playlist_index`: The sequence will be replaced by the index of the video in the playlist padded with leading zeros according to the total length of the playlist. + - `format_id`: The sequence will be replaced by the format code specified by `--format`. + - `duration`: The sequence will be replaced by the length of the video in seconds. The current default template is `%(title)s-%(id)s.%(ext)s`. 
@@ -432,9 +462,20 @@ $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filena youtube-dl_test_video_.mp4 # A simple file name ``` +# FORMAT SELECTION + +By default youtube-dl tries to download the best quality, but sometimes you may want to download in a different format. +The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`. You can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. + +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. + +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). 
If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. + +If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. + # VIDEO SELECTION -Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats: +Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`. They accept dates in two formats: - Absolute dates: Dates in the format `YYYYMMDD`. 
- Relative dates: Dates in the format `(now|today)[+-][0-9](day|week|month|year)(s)?` @@ -448,7 +489,7 @@ $ youtube-dl --dateafter now-6months # Download only the videos uploaded on January 1, 1970 $ youtube-dl --date 19700101 -$ # will only download the videos uploaded in the 200x decade +$ # Download only the videos uploaded in the 200x decade $ youtube-dl --dateafter 20000101 --datebefore 20091231 ``` @@ -460,7 +501,7 @@ If you've followed [our manual installation instructions](http://rg3.github.io/y If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. 
For that, remove the distribution's package, with a line like @@ -482,11 +523,11 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. -### Do I always have to pass in `--max-quality FORMAT`, or `-citw`? +### Do I always have to pass `-citw`? -By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`. +By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. -### Can you please put the -b option back? +### Can you please put the `-b` option back? Most people asking this question are not aware that youtube-dl now defaults to downloading the highest available quality as reported by YouTube, which will be 1080p or 720p in some cases, so you no longer need the `-b` option. 
For some specific videos, maybe YouTube does not report them to be available in a specific high quality format you're interested in. In that case, simply request it with the `-f` option and youtube-dl will try to download it. @@ -494,27 +535,57 @@ Most people asking this question are not aware that youtube-dl now defaults to d Apparently YouTube requires you to pass a CAPTCHA test if you download too much. We're [considering to provide a way to let you solve the CAPTCHA](https://github.com/rg3/youtube-dl/issues/154), but at the moment, your best course of action is pointing a webbrowser to the youtube URL, solving the CAPTCHA, and restart youtube-dl. +### Do I need any other programs? + +youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option. + +Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. + ### I have downloaded a video but how can I play it? Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). -### I extracted a video URL with -g, but it does not play on another machine / in my webbrowser. +### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. 
Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl. It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule. -Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using -g, your own downloader must support these as well. +Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using `-g`, your own downloader must support these as well. If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn. ### ERROR: no fmt_url_map or conn information found in video info -youtube has switched to a new video info format in July 2011 which is not supported by old versions of youtube-dl. You can update youtube-dl with `sudo youtube-dl --update`. +YouTube has switched to a new video info format in July 2011 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. ### ERROR: unable to download video ### -youtube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. You can update youtube-dl with `sudo youtube-dl --update`. +YouTube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. 
+ +### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` ### + +That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by the shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). + +For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with following command: + +```youtube-dl 'https://www.youtube.com/watch?t=4&v=BaW_jenozKc'``` + +or + +```youtube-dl https://www.youtube.com/watch?t=4\&v=BaW_jenozKc``` + +For Windows you have to use the double quotes: + +```youtube-dl "https://www.youtube.com/watch?t=4&v=BaW_jenozKc"``` + +### ExtractorError: Could not find JS function u'OF' + +In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. + +### HTTP Error 429: Too Many Requests or 402: Payment Required + +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. ### SyntaxError: Non-ASCII character ### @@ -552,6 +623,12 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt youtube-dl -- -wNyEUrxzFU youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" +### How do I pass cookies to youtube-dl? + +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. 
Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. + +Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. @@ -560,9 +637,21 @@ A note on the service that they don't host the infringing content, but just link Support requests for services that **do** purchase the rights to distribute their content are perfectly fine though. If in doubt, you can simply include a source that mentions the legitimate purchase of content. +### How can I speed up work on my issue? + +(Also known as: Help, my important issue not being solved!) The youtube-dl core developer team is quite small. While we do our best to solve as many issues as possible, sometimes that can take quite a while. To speed up your issue, here's what you can do: + +First of all, please do report the issue [at our issue tracker](https://yt-dl.org/bugs). That allows us to coordinate all efforts by users and developers, and serves as a unified point. 
Unfortunately, the youtube-dl project has grown too large to use personal email as an effective communication channel. + +Please read the [bug reporting instructions](#bugs) below. A lot of bugs lack all the necessary information. If you can, offer proxy, VPN, or shell access to the youtube-dl developers. If you are able to, test the issue from multiple computers in multiple countries to exclude local censorship or misconfiguration issues. + +If nobody is interested in solving your issue, you are welcome to take matters into your own hands and submit a pull request (or coerce/pay somebody else to do so). + +Feel free to bump the issue from time to time by writing a small comment ("Issue is still present in youtube-dl version ...from France, but fixed from Belgium"), but please not more than once a month. Please do not declare your issue as `important` or `urgent`. + ### How can I detect whether a given URL is supported by youtube-dl? -For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/v/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. +For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. 
You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. @@ -628,18 +717,19 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title') + title = self._html_search_regex(r'

<h1>(.+?)</h1>

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. 
If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -659,6 +749,7 @@ youtube-dl makes the best effort to be a good command-line program, and thus sho From a Python program, you can embed youtube-dl in a more powerful fashion, like this: ```python +from __future__ import unicode_literals import youtube_dl ydl_opts = {} @@ -666,11 +757,12 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L117-L265). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: ```python +from __future__ import unicode_literals import youtube_dl @@ -710,7 +802,7 @@ Bugs and suggestions should be reported at: log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. 
http://www.youtube.com/ ) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py index cd26cc089..ce68f26f9 100755 --- a/devscripts/bash-completion.py +++ b/devscripts/bash-completion.py @@ -5,7 +5,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl BASH_COMPLETION_FILE = "youtube-dl.bash-completion" diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 216282712..7a219ebe9 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -28,7 +28,7 @@ for test in get_testcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() - except: + except Exception: print('\nFail: {0}'.format(test['name'])) continue @@ -45,12 +45,12 @@ for test in get_testcases(): RESULT = ('.' 
+ domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] - or test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or + test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] - and test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py index c2f238798..41629d87d 100755 --- a/devscripts/fish-completion.py +++ b/devscripts/fish-completion.py @@ -6,7 +6,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl from youtube_dl.utils import shell_quote diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py new file mode 100644 index 000000000..2e389fc8e --- /dev/null +++ b/devscripts/generate_aes_testdata.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import codecs +import subprocess + +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import intlist_to_bytes +from youtube_dl.aes import aes_encrypt, key_expansion + +secret_msg = b'Secret message goes here' + + +def hex_str(int_list): + return codecs.encode(intlist_to_bytes(int_list), 'hex') + + +def openssl_encode(algo, key, iv): + cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)] + prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + out, _ = prog.communicate(secret_msg) + 
return out + +iv = key = [0x20, 0x15] + 14 * [0] + +r = openssl_encode('aes-128-cbc', key, iv) +print('aes_cbc_decrypt') +print(repr(r)) + +password = key +new_key = aes_encrypt(password, key_expansion(password)) +r = openssl_encode('aes-128-ctr', new_key, iv) +print('aes_decrypt_text 16') +print(repr(r)) + +password = key + 16 * [0] +new_key = aes_encrypt(password, key_expansion(password)) * (32 // 16) +r = openssl_encode('aes-256-ctr', new_key, iv) +print('aes_decrypt_text 32') +print(repr(r)) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index d3ef5f0b5..503c1372f 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -6,7 +6,7 @@ import os import textwrap # We must be able to import youtube_dl -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) import youtube_dl diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 3df4385a6..8cb4a4638 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -9,7 +9,7 @@ import sys # Import youtube_dl ROOT_DIR = os.path.join(os.path.dirname(__file__), '..') -sys.path.append(ROOT_DIR) +sys.path.insert(0, ROOT_DIR) import youtube_dl diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 7ece37754..776e6556e 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -8,6 +8,35 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') + +def filter_options(readme): + ret = '' + in_options = False + for line in readme.split('\n'): + if line.startswith('# '): + if line[2:].startswith('OPTIONS'): + in_options = True + else: + in_options = False + + if in_options: + if line.lstrip().startswith('-'): + option, description = 
re.split(r'\s{2,}', line.lstrip()) + split_option = option.split(' ') + + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + else: + ret += line.lstrip() + '\n' + else: + ret += line + '\n' + + return ret + with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() @@ -26,6 +55,8 @@ readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme +readme = filter_options(readme) + if sys.version_info < (3, 0): print(readme.encode('utf-8')) else: diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py index f200f2c80..04728e8e2 100755 --- a/devscripts/zsh-completion.py +++ b/devscripts/zsh-completion.py @@ -5,7 +5,7 @@ import os from os.path import dirname as dirn import sys -sys.path.append(dirn(dirn((os.path.abspath(__file__))))) +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) import youtube_dl ZSH_COMPLETION_FILE = "youtube-dl.zsh" diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8bce8fede..1df408610 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,29 +1,35 @@ # Supported sites + - **1tv**: Первый канал - **1up.com** - **220.ro** + - **22tracks:genre** + - **22tracks:track** - **24video** - **3sat** - **4tube** - **56.com** - **5min** - **8tracks** + - **91porn** - **9gag** - **abc.net.au** - **Abc7News** - **AcademicEarth:Course** - **AddAnime** - **AdobeTV** + - **AdobeTVVideo** - **AdultSwim** - **Aftenposten** - **Aftonbladet** + - **AirMozilla** - **AlJazeera** - **Allocine** - **AlphaPorno** - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleConnect** + - **AppleDaily**: 臺灣蘋果日報 - **AppleTrailers** - 
**archive.org**: archive.org videos - **ARD** @@ -40,11 +46,15 @@ - **audiomack** - **audiomack:album** - **Azubu** + - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** + - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer + - **bbc.co.uk:article**: BBC articles + - **BeatportPro** - **Beeg** - **BehindKink** - **Bet** @@ -57,24 +67,34 @@ - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek - **Break** - - **Brightcove** + - **brightcove:legacy** + - **brightcove:new** + - **bt:article**: Bergens Tidende Articles + - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** + - **Camdemy** + - **CamdemyFolder** - **Canal13cl** - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **CBS** - **CBSNews**: CBS News + - **CBSSports** - **CeskaTelevize** - **channel9**: Channel 9 + - **Chaturbate** - **Chilloutzone** + - **chirbit** + - **chirbit:profile** - **Cinchcast** - **Cinemassacre** - - **clipfish** + - **Clipfish** - **cliphunter** - **Clipsyndicate** - **Cloudy** - **Clubic** + - **Clyp** - **cmt.com** - **CNET** - **CNN** @@ -85,31 +105,41 @@ - **ComCarCoff** - **ComedyCentral** - **ComedyCentralShows**: The Daily Show / The Colbert Report - - **CondeNast**: Condé Nast media group: Condé Nast, GQ, Glamour, Vanity Fair, Vogue, W Magazine, WIRED + - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Cracked** - **Criterion** + - **CrooksAndLiars** - **Crunchyroll** - **crunchyroll:playlist** - **CSpan**: C-SPAN - - **CtsNews** + - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** + - **DailymotionCloud** - **daum.net** - **DBTV** + - **DCN** - **DctpTv** - **DeezerPlaylist** 
- **defense.gouv.fr** + - **democracynow** + - **DHM**: Filmarchiv - Deutsches Historisches Museum - **Discovery** - - **divxstage**: DivxStage - **Dotsub** + - **DouyuTV**: 斗鱼 + - **DPlay** + - **dramafever** + - **dramafever:series** - **DRBonanza** - **Dropbox** - **DrTuber** - **DRTV** - **Dump** + - **Dumpert** - **dvtv**: http://video.aktualne.cz/ + - **EaglePlatform** - **EbaumsWorld** - **EchoMsk** - **eHow** @@ -118,11 +148,15 @@ - **EllenTV** - **EllenTV:clips** - **ElPais**: El País + - **Embedly** - **EMPFlix** - **Engadget** - **Eporner** - **EroProfile** - **Escapist** + - **ESPN** (Currently broken) + - **EsriVideo** + - **Europa** - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -130,15 +164,16 @@ - **facebook** - **faz.net** - **fc2** + - **Fczenit** - **fernsehkritik.tv** - - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** - - **firsttv**: Видеоархив - Первый канал + - **FiveTV** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) + - **FootyRoom** - **Foxgay** - - **FoxNews** + - **FoxNews**: Fox News and Fox Business Video + - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** - **FranceInter** @@ -151,21 +186,23 @@ - **Gamekings** - **GameOne** - **gameone:playlist** + - **Gamersyde** - **GameSpot** - **GameStar** - **Gametrailers** + - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites + - **Gfycat** - **GiantBomb** - **Giga** - **Glide**: Glide mobile video messages (glide.me) - **Globo** + - **GloboArticle** - **GodTube** - **GoldenMoustache** - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in and fastvideo.in - **Goshgay** - - **Grooveshark** - **Groupon** - **Hark** - **HearThisAt** @@ -174,10 +211,10 @@ - **Helsinki**: helsinki.fi - **HentaiStigma** - **HistoricFilms** + - **History** - **hitbox** - **hitbox:live** - **HornBunny** - - **HostingBulk** - **HotNewHipHop** - **Howcast** - **HowStuffWorks** @@ -187,12 +224,18 @@ - 
**ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists + - **Imgur** + - **ImgurAlbum** - **Ina** + - **Indavideo** + - **IndavideoEmbed** - **InfoQ** - **Instagram** - **instagram:user**: Instagram user profile - **InternetVideoArchive** - **IPrima** + - **iqiyi**: 爱奇艺 + - **Ir90Tv** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -201,8 +244,11 @@ - **Jove** - **jpopsuki.tv** - **Jukebox** + - **Kaltura** + - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -210,9 +256,24 @@ - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** + - **kuwo:album**: 酷我音乐 - 专辑 + - **kuwo:category**: 酷我音乐 - 分类 + - **kuwo:chart**: 酷我音乐 - 排行榜 + - **kuwo:mv**: 酷我音乐 - MV + - **kuwo:singer**: 酷我音乐 - 歌手 + - **kuwo:song**: 酷我音乐 - **la7.tv** - **Laola1Tv** + - **Lecture2Go** + - **Letv**: 乐视网 + - **LetvPlaylist** + - **LetvTv** + - **Libsyn** + - **life:embed** - **lifenews**: LIFE | NEWS + - **limelight** + - **limelight:channel** + - **limelight:channel_list** - **LiveLeak** - **livestream** - **livestream:original** @@ -224,14 +285,15 @@ - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **Malemotion** - - **MDR** + - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **metacafe** - **Metacritic** - **Mgoon** - **Minhateca** - **MinistryGrid** - - **mitele.es** + - **miomio.tv** + - **MiTele**: mitele.es - **mixcloud** - **MLB** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -243,35 +305,51 @@ - **Motherless** - **Motorsport**: motorsport.com - **MovieClips** + - **MovieFap** - **Moviezine** - **movshare**: MovShare - **MPORA** + - **MSNBC** - **MTV** + - **mtv.de** - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **MusicVault** - **muzu.tv** + - **Mwave** - **MySpace** - **MySpace:album** - 
**MySpass** + - **Myvi** - **myvideo** - **MyVidster** - **n-tv.de** + - **NationalGeographic** - **Naver** - **NBA** - **NBC** - **NBCNews** - - **ndr**: NDR.de - Mediathek + - **NBCSports** + - **NBCSportsVPlayer** + - **ndr**: NDR.de - Norddeutscher Rundfunk + - **ndr:embed** + - **ndr:embed:base** - **NDTV** - **NerdCubedFeed** - **Nerdist** + - **netease:album**: 网易云音乐 - 专辑 + - **netease:djradio**: 网易云音乐 - 电台 + - **netease:mv**: 网易云音乐 - MV + - **netease:playlist**: 网易云音乐 - 歌单 + - **netease:program**: 网易云音乐 - 电台节目 + - **netease:singer**: 网易云音乐 - 歌手 + - **netease:song**: 网易云音乐 - **Netzkino** - **Newgrounds** - **Newstube** - - **NextMedia** - - **NextMediaActionNews** + - **NextMedia**: 蘋果日報 + - **NextMediaActionNews**: 蘋果日報 - 動新聞 - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -279,52 +357,86 @@ - **nhl.com:videocenter**: NHL videocenter category - **niconico**: ニコニコ動画 - **NiconicoPlaylist** + - **njoy**: N-JOY + - **njoy:embed** - **Noco** - **Normalboots** - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - - **Nowness** + - **nowness** + - **nowness:playlist** + - **nowness:series** + - **NowTV** + - **NowTVList** - **nowvideo**: NowVideo - - **npo.nl** + - **npo**: npo.nl and ntr.nl - **npo.nl:live** + - **npo.nl:radio** + - **npo.nl:radio:fragment** - **NRK** - - **NRKTV** + - **NRKPlaylist** + - **NRKTV**: NRK TV and NRK Radio - **ntv.ru** - **Nuvid** - **NYTimes** + - **NYTimesArticle** - **ocw.mit.edu** + - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** + - **OnionStudios** - **Ooyala** - - **OpenFilm** + - **OoyalaExternal** - **orf:fm4**: radio FM4 + - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek - **parliamentlive.tv**: UK parliament videos - **Patreon** - **PBS** + - **Periscope**: Periscope + - **PhilharmonieDeParis**: Philharmonie de Paris - **Phoenix** - **Photobucket** + - **Pinkbike** + - 
**Pladform** - **PlanetaPlay** - **play.fm** - **played.to** + - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** + - **Playwire** + - **pluralsight** + - **pluralsight:course** - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** - **PornHd** - **PornHub** + - **PornHubPlaylist** - **Pornotube** + - **PornoVoisines** - **PornoXO** + - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **Puls4** - **Pyvideo** + - **qqmusic**: QQ音乐 + - **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:playlist**: QQ音乐 - 歌单 + - **qqmusic:singer**: QQ音乐 - 歌手 + - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuickVid** + - **R7** - **radio.de** - **radiobremen** - **radiofrance** + - **RadioJavan** - **Rai** - **RBMARadio** + - **RDS**: RDS.ca - **RedTube** - **Restudy** - **ReverbNation** @@ -333,13 +445,14 @@ - **Roxwel** - **RTBF** - **Rte** + - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - - **rtlxl.nl** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams + - **RTVNH** - **RUHD** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels @@ -347,6 +460,10 @@ - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU + - **Ruutu** + - **safari**: safaribooksonline.com online video + - **safari:course**: safaribooksonline.com online courses + - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -355,10 +472,12 @@ - **Screencast** - **ScreencastOMatic** - **ScreenwaveMedia** + - **SenateISVP** - **ServingSys** - **Sexu** - **SexyKarma**: Sexy Karma and Watch Indian Porn - - **Shared** + - **Shahid** + - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - **Slideshare** @@ -367,17 +486,26 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com 
user videos + - **SnagFilms** + - **SnagFilmsEmbed** - **Snotr** - - **Sockshare** - **Sohu** + - **soompi** + - **soompi:show** - **soundcloud** - **soundcloud:playlist** + - **soundcloud:search**: Soundcloud search - **soundcloud:set** - **soundcloud:user** - - **Soundgasm** + - **soundgasm** + - **soundgasm:profile** - **southpark.cc.com** + - **southpark.cc.com:español** - **southpark.de** + - **southpark.nl** + - **southparkstudios.dk** - **Space** + - **SpankBang** - **Spankwire** - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de @@ -385,15 +513,21 @@ - **Spike** - **Sport5** - **SportBox** + - **SportBoxEmbed** - **SportDeutschland** + - **Sportschau** + - **Srf** - **SRMediathek**: Saarländischer Rundfunk + - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** + - **Stitcher** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** - **SunPorno** - - **SVTPlay** + - **SVT** + - **SVTPlay**: SVT Play and Öppet arkiv - **SWRMediathek** - **Syfy** - **SztvHu** @@ -407,10 +541,10 @@ - **TeamFour** - **TechTalks** - **techtv.mit.edu** - - **TED** - - **tegenlicht.vpro.nl** + - **ted** - **TeleBruxelles** - - **telecinco.es** + - **Telecinco**: telecinco.es, cuatro.com and mediaset.es + - **Telegraaf** - **TeleMB** - **TeleTask** - **TenPlay** @@ -418,7 +552,9 @@ - **TF1** - **TheOnion** - **ThePlatform** + - **ThePlatformFeed** - **TheSixtyOne** + - **ThisAmericanLife** - **ThisAV** - **THVideo** - **THVideoPlaylist** @@ -426,6 +562,7 @@ - **tlc.com** - **tlc.de** - **TMZ** + - **TMZArticle** - **TNAFlix** - **tou.tv** - **Toypics**: Toypics user profile @@ -434,12 +571,18 @@ - **Trilulilu** - **TruTube** - **Tube8** + - **TubiTv** - **Tudou** - **Tumblr** - **TuneIn** - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV2** + - **TV2Article** + - **TV4**: tv4.se and tv4play.se + - **TVC** + - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** @@ -452,28 +595,32 @@ - **twitch:stream** - **twitch:video** - 
**twitch:vod** + - **twitter** + - **twitter:card** - **Ubu** - **udemy** - **udemy:course** + - **UDNEmbed**: 聯合影音 + - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt - **ustream** - **ustream:channel** + - **Varzesh3** - **Vbox7** - **VeeHD** - **Veoh** + - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - - **VGTV** + - **VGTV**: VGTV and BTTV - **vh1.com** - **Vice** - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** - - **VideoBam** - **VideoDetective** - **videofy.me** - - **videolectures.net** - **VideoMega** - **VideoPremium** - **VideoTt**: video.tt - Your True Tube @@ -482,7 +629,10 @@ - **Vidzi** - **vier** - **vier:videos** + - **Viewster** + - **Viidea** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** @@ -491,13 +641,16 @@ - **vimeo:review**: Review pages on vimeo - **vimeo:user** - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) - - **Vimple**: Vimple.ru + - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** - - **vk.com** - - **vk.com:user-videos**: vk.com:All of a user's videos + - **vk**: VK + - **vk:uservideos**: VK - User's Videos + - **vlive** - **Vodlocker** + - **VoiceRepublic** - **Vporn** + - **vpro**: npo.nl and ntr.nl - **VRT** - **vube**: Vube.com - **VuClip** @@ -510,27 +663,37 @@ - **wdr:mobile** - **WDRMaus**: Sendung mit der Maus - **WebOfStories** + - **WebOfStoriesPlaylist** - **Weibo** - **Wimp** - **Wistia** + - **WNL** - **WorldStarHipHop** - **wrzuta.pl** - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** + - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me - **XHamster** + - **XHamsterEmbed** - **XMinus** - **XNXX** + - **Xstream** - **XTube** - **XTubeUser**: XTube user profile - - **Xuite** + - **Xuite**: 隨意窩Xuite影音 - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies + - **Yam**: 
蕃薯藤yam天空部落 + - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист + - **yandexmusic:track**: Яндекс.Музыка - Трек - **YesJapan** + - **yinyuetai:video**: 音悦Tai - **Ynet** - **YouJizz** - - **Youku** + - **youku**: 优酷 - **YouPorn** - **YourUpload** - **youtube**: YouTube.com @@ -545,7 +708,9 @@ - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube:user:playlists**: YouTube.com user playlists + - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **Zapiks** - **ZDF** - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums diff --git a/setup.py b/setup.py index 4686260e0..bfe931f5b 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ py2exe_options = { "compressed": 1, "optimize": 2, "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe'], + "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], } py2exe_console = [{ diff --git a/test/helper.py b/test/helper.py index 651ef99b9..bdd7acca4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,58 +89,83 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, got_dict, expected_dict): +def expect_value(self, got, expected, field): + if isinstance(expected, compat_str) and expected.startswith('re:'): + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + match_rex.match(got), + 'field %s (value: %r) should match %r' % (field, got, match_str)) + elif 
isinstance(expected, compat_str) and expected.startswith('startswith:'): + start_str = expected[len('startswith:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + got.startswith(start_str), + 'field %s (value: %r) should start with %r' % (field, got, start_str)) + elif isinstance(expected, compat_str) and expected.startswith('contains:'): + contains_str = expected[len('contains:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + contains_str in got, + 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, type): + self.assertTrue( + isinstance(got, expected), + 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got, dict): + expect_dict(self, got, expected) + elif isinstance(expected, list) and isinstance(got, list): + self.assertEqual( + len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d for field %s' % ( + len(expected), len(got), field)) + for index, (item_got, item_expected) in enumerate(zip(got, expected)): + type_got = type(item_got) + type_expected = type(item_expected) + self.assertEqual( + type_expected, type_got, + 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( + index, field, type_expected, type_got)) + expect_value(self, item_got, item_expected, field) + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(got) + elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + self.assertTrue( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( + field, type(got).__name__)) + expected_num 
= int(expected.partition(':')[2]) + assertGreaterEqual( + self, len(got), expected_num, + 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + return + self.assertEqual( + expected, got, + 'Invalid value for field %s, expected %r, got %r' % (field, expected, got)) + + +def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): - if isinstance(expected, compat_str) and expected.startswith('re:'): - got = got_dict.get(info_field) - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) + got = got_dict.get(info_field) + expect_value(self, got, expected, info_field) - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - match_rex.match(got), - 'field %s (value: %r) should match %r' % (info_field, got, match_str)) - elif isinstance(expected, compat_str) and expected.startswith('startswith:'): - got = got_dict.get(info_field) - start_str = expected[len('startswith:'):] - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - got.startswith(start_str), - 'field %s (value: %r) should start with %r' % (info_field, got, start_str)) - elif isinstance(expected, type): - got = got_dict.get(info_field) - self.assertTrue(isinstance(got, expected), - 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) - else: - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(got_dict.get(info_field)) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): - got = got_dict.get(info_field) - self.assertTrue( - isinstance(got, list), - 'Expected field %s to be a list, but it is of type %s' % ( - info_field, type(got).__name__)) - expected_num = 
int(expected.partition(':')[2]) - assertGreaterEqual( - self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % ( - expected_num, info_field, len(got) - ) - ) - continue - else: - got = got_dict.get(info_field) - self.assertEqual(expected, got, - 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) +def expect_info_dict(self, got_dict, expected_dict): + expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields - if got_dict.get('_type') != 'playlist': + if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for mandatory fields that are automatically set by YoutubeDL @@ -150,7 +175,7 @@ def expect_info_dict(self, got_dict, expected_dict): # Are checkable fields missing from the test case definition? test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in got_dict.items() - if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) + if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit')) missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: def _repr(v): @@ -163,12 +188,14 @@ def expect_info_dict(self, got_dict, expected_dict): info_dict_str += ''.join( ' %s: %s,\n' % (_repr(k), _repr(v)) for k, v in test_info_dict.items() if k not in missing_keys) - info_dict_str += '\n' + + if info_dict_str: + info_dict_str += '\n' info_dict_str += ''.join( ' %s: %s,\n' % (_repr(k), _repr(test_info_dict[k])) for k in missing_keys) write_string( - '\n\'info_dict\': {\n' + info_dict_str + '}\n', out=sys.stderr) + '\n\'info_dict\': {\n' + info_dict_str + '},\n', out=sys.stderr) self.assertFalse( missing_keys, 'Missing keys in test 
definition: %s' % ( diff --git a/test/parameters.json b/test/parameters.json index af77b89b4..7bf59c25f 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -7,8 +7,7 @@ "forcethumbnail": false, "forcetitle": false, "forceurl": false, - "format": null, - "format_limit": null, + "format": "best", "ignoreerrors": false, "listformats": null, "logtostderr": false, @@ -28,7 +27,7 @@ "retries": 10, "simulate": false, "subtitleslang": null, - "subtitlesformat": "srt", + "subtitlesformat": "best", "test": true, "updatetime": true, "usenetrc": false, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index be8d12997..938466a80 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,18 @@ class TestInfoExtractor(unittest.TestCase): + + + + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') + self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') + self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') def test_html_search_meta(self): ie = self.ie diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b1cd6a69f..0388c0bf3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -12,8 +12,12 @@ import copy from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor +from youtube_dl.utils import ExtractorError, match_filter_func + +TEST_URL = 'http://localhost/sample.mp4' class YDL(FakeYDL): @@ -46,8 +50,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() 
ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460, 'url': 'x'}, - {'ext': 'mp4', 'height': 460, 'url': 'y'}, + {'ext': 'webm', 'height': 460, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 460, 'url': TEST_URL}, ] info_dict = _make_result(formats) yie = YoutubeIE(ydl) @@ -60,8 +64,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720, 'url': 'a'}, - {'ext': 'mp4', 'height': 1080, 'url': 'b'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 1080, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -74,9 +78,9 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720, 'url': '_'}, - {'ext': 'mp4', 'height': 720, 'url': '_'}, - {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, + {'ext': 'mp4', 'height': 720, 'url': TEST_URL}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -88,8 +92,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720, 'url': '_'}, - {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': TEST_URL}, + {'ext': 'webm', 'height': 720, 'url': TEST_URL}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -98,45 +102,13 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'flv') - def test_format_limit(self): - formats = [ - {'format_id': 'meh', 'url': 'http://example.com/meh', 'preference': 1}, - {'format_id': 'good', 'url': 'http://example.com/good', 'preference': 2}, - {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, - {'format_id': 'excellent', 'url': 
'http://example.com/exc', 'preference': 4}, - ] - info_dict = _make_result(formats) - - ydl = YDL() - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - - ydl = YDL({'format_limit': 'good'}) - assert ydl.params['format_limit'] == 'good' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'good') - - ydl = YDL({'format_limit': 'great', 'format': 'all'}) - ydl.process_ie_result(info_dict.copy()) - self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'meh') - self.assertEqual(ydl.downloaded_info_dicts[1]['format_id'], 'good') - self.assertEqual(ydl.downloaded_info_dicts[2]['format_id'], 'great') - self.assertTrue('3' in ydl.msgs[0]) - - ydl = YDL() - ydl.params['format_limit'] = 'excellent' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, - {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, - {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'example-with-dashes', 'ext': 'webm', 'preference': 1, 'url': TEST_URL}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -165,12 +137,17 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') + ydl = YDL({'format': 'example-with-dashes'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = 
ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'example-with-dashes') + def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -185,8 +162,8 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': TEST_URL}, ] info_dict = _make_result(formats) @@ -228,9 +205,9 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_video(self): formats = [ - {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, - {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': 
TEST_URL}, ] info_dict = _make_result(formats) @@ -258,29 +235,89 @@ class TestFormatSelection(unittest.TestCase): '141', '172', '140', '171', '139', ] - for f1id, f2id in zip(order, order[1:]): - f1 = YoutubeIE._formats[f1id].copy() - f1['format_id'] = f1id - f1['url'] = 'url:' + f1id - f2 = YoutubeIE._formats[f2id].copy() - f2['format_id'] = f2id - f2['url'] = 'url:' + f2id + def format_info(f_id): + info = YoutubeIE._formats[f_id].copy() + info['format_id'] = f_id + info['url'] = 'url:' + f_id + return info + formats_order = [format_info(f_id) for f_id in order] + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['ext'], 'mp4') + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '38') + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo/best,bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137', '141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137+141', 
'248+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['136+141', '247+141']) + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['248+141']) + + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) + + def test_invalid_format_specs(self): + def assert_syntax_error(format_spec): + ydl = YDL({'format': format_spec}) + info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) + self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) + + assert_syntax_error('bestvideo,,best') + assert_syntax_error('+bestaudio') + assert_syntax_error('bestvideo+') + assert_syntax_error('/') def test_format_filtering(self): formats = [ 
@@ -337,6 +374,79 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + ydl = YDL({'format': 'all[width>=400][width<=600]'}) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + + ydl = YDL({'format': 'best[height<40]'}) + try: + ydl.process_ie_result(info_dict) + except ExtractorError: + pass + self.assertEqual(ydl.downloaded_info_dicts, []) + + +class TestYoutubeDL(unittest.TestCase): + def test_subtitles(self): + def s_formats(lang, autocaption=False): + return [{ + 'ext': ext, + 'url': 'http://localhost/video.%s.%s' % (lang, ext), + '_auto': autocaption, + } for ext in ['vtt', 'srt', 'ass']] + subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) + auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) + info_dict = { + 'id': 'test', + 'title': 'Test', + 'url': 'http://localhost/video.mp4', + 'subtitles': subtitles, + 'automatic_captions': auto_captions, + 'extractor': 'TEST', + } + + def get_info(params={}): + params.setdefault('simulate', True) + ydl = YDL(params) + ydl.report_warning = lambda *args, **kargs: None + return ydl.process_video_result(info_dict, download=False) + + result = get_info() + self.assertFalse(result.get('requested_subtitles')) + self.assertEqual(result['subtitles'], subtitles) + self.assertEqual(result['automatic_captions'], auto_captions) + + result = get_info({'writesubtitles': True}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + self.assertTrue(subs['en'].get('data') is None) + self.assertEqual(subs['en']['ext'], 'ass') + + result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) + subs = result['requested_subtitles'] + self.assertEqual(subs['en']['ext'], 'srt') + + result = get_info({'writesubtitles': True, 
'subtitleslangs': ['es', 'fr', 'it']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertFalse(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + + result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertTrue(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', @@ -379,27 +489,148 @@ class TestFormatSelection(unittest.TestCase): def run(self, info): with open(audiofile, 'wt') as f: f.write('EXAMPLE') - info['filepath'] - return False, info + return [info['filepath']], info - def run_pp(params): + def run_pp(params, PP): with open(filename, 'wt') as f: f.write('EXAMPLE') ydl = YoutubeDL(params) - ydl.add_post_processor(SimplePP()) + ydl.add_post_processor(PP()) ydl.post_process(filename, {'filepath': filename}) - run_pp({'keepvideo': True}) + run_pp({'keepvideo': True}, SimplePP) self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(filename) os.unlink(audiofile) - run_pp({'keepvideo': False}) + run_pp({'keepvideo': False}, SimplePP) self.assertFalse(os.path.exists(filename), '%s exists' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(audiofile) + class ModifierPP(PostProcessor): + def run(self, info): + with open(info['filepath'], 'wt') as f: + f.write('MODIFIED') + return [], info + + run_pp({'keepvideo': False}, ModifierPP) + self.assertTrue(os.path.exists(filename), '%s 
doesn\'t exist' % filename) + os.unlink(filename) + + def test_match_filter(self): + class FilterYDL(YDL): + def __init__(self, *args, **kwargs): + super(FilterYDL, self).__init__(*args, **kwargs) + self.params['simulate'] = True + + def process_info(self, info_dict): + super(YDL, self).process_info(info_dict) + + def _match_entry(self, info_dict, incomplete): + res = super(FilterYDL, self)._match_entry(info_dict, incomplete) + if res is None: + self.downloaded_info_dicts.append(info_dict) + return res + + first = { + 'id': '1', + 'url': TEST_URL, + 'title': 'one', + 'extractor': 'TEST', + 'duration': 30, + 'filesize': 10 * 1024, + } + second = { + 'id': '2', + 'url': TEST_URL, + 'title': 'two', + 'extractor': 'TEST', + 'duration': 10, + 'description': 'foo', + 'filesize': 5 * 1024, + } + videos = [first, second] + + def get_videos(filter_=None): + ydl = FilterYDL({'match_filter': filter_}) + for v in videos: + ydl.process_ie_result(v, download=True) + return [v['id'] for v in ydl.downloaded_info_dicts] + + res = get_videos() + self.assertEqual(res, ['1', '2']) + + def f(v): + if v['id'] == '1': + return None + else: + return 'Video id is not 1' + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func('duration < 30') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description = foo') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('description =? 
foo') + res = get_videos(f) + self.assertEqual(res, ['1', '2']) + + f = match_filter_func('filesize > 5KiB') + res = get_videos(f) + self.assertEqual(res, ['1']) + + def test_playlist_items_selection(self): + entries = [{ + 'id': compat_str(i), + 'title': compat_str(i), + 'url': TEST_URL, + } for i in range(1, 5)] + playlist = { + '_type': 'playlist', + 'id': 'test', + 'entries': entries, + 'extractor': 'test:playlist', + 'extractor_key': 'test:playlist', + 'webpage_url': 'http://example.com', + } + + def get_ids(params): + ydl = YDL(params) + # make a copy because the dictionary can be modified + ydl.process_ie_result(playlist.copy()) + return [int(v['id']) for v in ydl.downloaded_info_dicts] + + result = get_ids({}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 10}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 2}) + self.assertEqual(result, [1, 2]) + + result = get_ids({'playliststart': 10}) + self.assertEqual(result, []) + + result = get_ids({'playliststart': 2}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2-4'}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2,4'}) + self.assertEqual(result, [2, 4]) + + result = get_ids({'playlist_items': '10'}) + self.assertEqual(result, []) + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py new file mode 100644 index 000000000..315a3f5ae --- /dev/null +++ b/test/test_aes.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes +import base64 + +# the encrypted data can be generate with 'devscripts/generate_aes_testdata.py' + + 
+class TestAES(unittest.TestCase): + def setUp(self): + self.key = self.iv = [0x20, 0x15] + 14 * [0] + self.secret_msg = b'Secret message goes here' + + def test_encrypt(self): + msg = b'message' + key = list(range(16)) + encrypted = aes_encrypt(bytes_to_intlist(msg), key) + decrypted = intlist_to_bytes(aes_decrypt(encrypted, key)) + self.assertEqual(decrypted, msg) + + def test_cbc_decrypt(self): + data = bytes_to_intlist( + b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" + ) + decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_decrypt_text(self): + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' + ).decode('utf-8') + decrypted = (aes_decrypt_text(encrypted, password, 16)) + self.assertEqual(decrypted, self.secret_msg) + + password = intlist_to_bytes(self.key).decode('utf-8') + encrypted = base64.b64encode( + intlist_to_bytes(self.iv[:8]) + + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' + ).decode('utf-8') + decrypted = (aes_decrypt_text(encrypted, password, 32)) + self.assertEqual(decrypted, self.secret_msg) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index e66264b4b..a9db42b30 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -59,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) 
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) @@ -104,11 +104,11 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): - self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel']) - self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo']) - self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) - self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user']) + self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259', ['vimeo:channel']) + self.assertMatch('https://vimeo.com/channels/31259/53576664', ['vimeo']) + self.assertMatch('https://vimeo.com/user7108434', ['vimeo:user']) + self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user']) self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review']) # https://github.com/rg3/youtube-dl/issues/1930 diff --git a/test/test_compat.py b/test/test_compat.py index 1eb454e06..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,7 +13,12 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_etree_fromstring, compat_expanduser, + compat_shlex_split, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, ) @@ -42,5 +47,46 @@ class TestCompat(unittest.TestCase): dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_unquote(self): + self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') + 
self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') + self.assertEqual(compat_urllib_parse_unquote(''), '') + self.assertEqual(compat_urllib_parse_unquote('%'), '%') + self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') + self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') + self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') + self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') + self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') + self.assertEqual( + compat_urllib_parse_unquote(''' +%%a'''), + ''' +%%a''') + self.assertEqual( + compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), + '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') + + def test_compat_urllib_parse_unquote_plus(self): + self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + + def test_compat_shlex_split(self): + self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + + def test_compat_etree_fromstring(self): + xml = ''' + + foo + 中文 + spam + + ''' + doc = compat_etree_fromstring(xml.encode('utf-8')) + self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) + self.assertTrue(isinstance(doc.find('normal').text, compat_str)) + self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index 6a149ae4f..a3f1c0644 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -102,7 +102,7 @@ def generator(test_case): params = get_params(test_case.get('params', {})) if is_playlist and 'playlist' not in test_case: - 
params.setdefault('extract_flat', True) + params.setdefault('extract_flat', 'in_playlist') params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) @@ -136,7 +136,9 @@ def generator(test_case): # We're not using .download here sine that is just a shim # for outside error handling, and returns the exit code # instead of the result dict. - res_dict = ydl.extract_info(test_case['url']) + res_dict = ydl.extract_info( + test_case['url'], + force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): @@ -153,7 +155,7 @@ def generator(test_case): break if is_playlist: - self.assertEqual(res_dict['_type'], 'playlist') + self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video']) self.assertTrue('entries' in res_dict) expect_info_dict(self, res_dict, test_case.get('info_dict', {})) diff --git a/test/test_execution.py b/test/test_execution.py index 60df187de..620db080e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# coding: utf-8 + from __future__ import unicode_literals import unittest @@ -6,6 +8,9 @@ import unittest import sys import os import subprocess +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import encodeArgument rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -27,5 +32,12 @@ class TestExecution(unittest.TestCase): def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + def test_cmdline_umlauts(self): + p = subprocess.Popen( + [sys.executable, 'youtube_dl/__main__.py', encodeArgument('ä'), '--version'], + 
cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) + _, stderr = p.communicate() + self.assertFalse(stderr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_http.py b/test/test_http.py index bd4d46fef..f2e305b6f 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl import YoutubeDL -from youtube_dl.compat import compat_http_server +from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl import threading @@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase): r = ydl.extract_info('https://localhost:%d/video.html' % self.port) self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port) + +def _build_proxy_handler(name): + class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + proxy_name = name + + def log_message(self, format, *args): + pass + + def do_GET(self): + self.send_response(200) + self.send_header('Content-Type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8')) + return HTTPTestRequestHandler + + +class TestProxy(unittest.TestCase): + def setUp(self): + self.proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('normal')) + self.port = self.proxy.socket.getsockname()[1] + self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) + self.proxy_thread.daemon = True + self.proxy_thread.start() + + self.cn_proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('cn')) + self.cn_port = self.cn_proxy.socket.getsockname()[1] + self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) + self.cn_proxy_thread.daemon = True + self.cn_proxy_thread.start() + + def test_proxy(self): + cn_proxy = 'localhost:{0}'.format(self.cn_port) + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + 
'cn_verification_proxy': cn_proxy, + }) + url = 'http://foo.com/bar' + response = ydl.urlopen(url).read().decode('utf-8') + self.assertEqual(response, 'normal: {0}'.format(url)) + + req = compat_urllib_request.Request(url) + req.add_header('Ytdl-request-proxy', cn_proxy) + response = ydl.urlopen(req).read().decode('utf-8') + self.assertEqual(response, 'cn: {0}'.format(url)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b91b8c492..63c350b8f 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -19,6 +19,9 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) + jsi = JSInterpreter('var x5 = function(){return 42;}') + self.assertEqual(jsi.call_function('x5'), 42) + def test_calc(self): jsi = JSInterpreter('function x4(a){return 2*a+1;}') self.assertEqual(jsi.call_function('x4', 3), 7) @@ -70,6 +73,8 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('f'), -11) def test_comments(self): + 'Skipping: Not yet fully implemented' + return jsi = JSInterpreter(''' function x() { var x = /* 1 + */ 2; @@ -80,6 +85,15 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 52) + jsi = JSInterpreter(''' + function f() { + var x = "/*"; + var y = 1 /* comment */ + 2; + return y; + } + ''') + self.assertEqual(jsi.call_function('f'), 3) + def test_precedence(self): jsi = JSInterpreter(''' function x() { diff --git a/test/test_netrc.py b/test/test_netrc.py new file mode 100644 index 000000000..7cf3a6a2e --- /dev/null +++ b/test/test_netrc.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import ( + gen_extractors, +) + + +class TestNetRc(unittest.TestCase): + def 
test_netrc_present(self): + for ie in gen_extractors(): + if not hasattr(ie, '_login'): + continue + self.assertTrue( + hasattr(ie, '_NETRC_MACHINE'), + 'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py new file mode 100644 index 000000000..addb69d6f --- /dev/null +++ b/test/test_postprocessors.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.postprocessor import MetadataFromTitlePP + + +class TestMetadataFromTitle(unittest.TestCase): + def test_format_to_regex(self): + pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') + self.assertEqual(pp._titleregex, '(?P.+)\ \-\ (?P<artist>.+)') diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 3e329438f..75f0ea75f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -18,6 +18,17 @@ from youtube_dl.extractor import ( VimeoIE, WallaIE, CeskaTelevizeIE, + LyndaIE, + NPOIE, + ComedyCentralIE, + NRKTVIE, + RaiIE, + VikiIE, + ThePlatformIE, + ThePlatformFeedIE, + RTVEALaCartaIE, + FunnyOrDieIE, + DemocracynowIE, ) @@ -27,42 +38,38 @@ class BaseTestSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() - self.ie = self.IE(self.DL) + self.ie = self.IE() + self.DL.add_info_extractor(self.ie) def getInfoDict(self): - info_dict = self.ie.extract(self.url) + info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] + if not subtitles: + return subtitles + for sub_info in subtitles.values(): + if sub_info.get('data') is None: + uf = self.DL.urlopen(sub_info['url']) + sub_info['data'] = 
uf.read().decode('utf-8') + return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_no_writesubtitles(self): - self.DL.params['writesubtitles'] = False - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_youtube_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - - def test_youtube_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + for lang in ['it', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True @@ -76,12 +83,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - def test_youtube_list_subtitles(self): - self.DL.expect_warning('Video doesn\'t have automatic captions') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True @@ -103,246 +104,267 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = 
self.getSubtitles() - self.assertEqual(len(subtitles), 0) - - def test_youtube_multiple_langs(self): - self.url = 'QRS8MkLhQmM' - self.DL.params['writesubtitles'] = True - langs = ['it', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertFalse(subtitles) class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_no_writesubtitles(self): + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) + self.assertTrue(len(subtitles.keys()) >= 6) + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + for lang in ['es', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - def test_subtitles(self): + def test_nosubtitles(self): + self.DL.expect_warning('video doesn\'t have subtitles') + self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + self.assertFalse(subtitles) + - def test_subtitles_lang(self): +class TestTedSubtitles(BaseTestSubtitles): + url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' + IE = TEDIE + + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + self.assertTrue(len(subtitles.keys()) >= 28) + 
self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') + self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') + for lang in ['es', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + + +class TestBlipTVSubtitles(BaseTestSubtitles): + url = 'http://blip.tv/a/a-6603250' + IE = BlipTVIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles.keys()), 5) + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '5b75c300af65fe4476dff79478bb93e4') - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] +class TestVimeoSubtitles(BaseTestSubtitles): + url = 'http://vimeo.com/76979871' + IE = VimeoIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) + self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) + self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') + self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' + self.url = 'http://vimeo.com/56015672' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - 
self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) + - def test_multiple_langs(self): +class TestWallaSubtitles(BaseTestSubtitles): + url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' + IE = WallaIE + + def test_allsubtitles(self): + self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertEqual(set(subtitles.keys()), set(['heb'])) + self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920') + def test_nosubtitles(self): + self.DL.expect_warning('video doesn\'t have subtitles') + self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertFalse(subtitles) -class TestTedSubtitles(BaseTestSubtitles): - url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' - IE = TEDIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) +class TestCeskaTelevizeSubtitles(BaseTestSubtitles): + url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' + IE = CeskaTelevizeIE - def test_subtitles(self): + def test_allsubtitles(self): + self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') + self.assertEqual(set(subtitles.keys()), set(['cs'])) + self.assertTrue(len(subtitles['cs']) > 20000) - def test_subtitles_lang(self): + def test_nosubtitles(self): + self.DL.expect_warning('video 
doesn\'t have subtitles') + self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220' self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') + self.assertFalse(subtitles) + + +class TestLyndaSubtitles(BaseTestSubtitles): + url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' + IE = LyndaIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) >= 28) + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) +class TestNPOSubtitles(BaseTestSubtitles): + url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' + IE = NPOIE - def test_multiple_langs(self): + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertEqual(set(subtitles.keys()), set(['nl'])) + self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') 
-class TestBlipTVSubtitles(BaseTestSubtitles): - url = 'http://blip.tv/a/a-6603250' - IE = BlipTVIE +class TestMTVSubtitles(BaseTestSubtitles): + url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + IE = ComedyCentralIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) + def getInfoDict(self): + return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), '5b75c300af65fe4476dff79478bb93e4') + self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') -class TestVimeoSubtitles(BaseTestSubtitles): - url = 'http://vimeo.com/76979871' - IE = VimeoIE +class TestNRKSubtitles(BaseTestSubtitles): + url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' + IE = NRKTVIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_subtitles(self): + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') + self.assertEqual(set(subtitles.keys()), set(['no'])) + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') + - def test_subtitles_lang(self): +class TestRaiSubtitles(BaseTestSubtitles): + url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' + IE = RaiIE + + def test_allsubtitles(self): 
self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + + +class TestVikiSubtitles(BaseTestSubtitles): + url = 'http://www.viki.com/videos/1060846v-punch-episode-18' + IE = VikiIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) +class TestThePlatformSubtitles(BaseTestSubtitles): + # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ + # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) + url = 'theplatform:JFUjUE1_ehvq' + IE = ThePlatformIE - def test_nosubtitles(self): - self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://vimeo.com/56015672' + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertEqual(set(subtitles.keys()), set(['en'])) + 
self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') + + +class TestThePlatformFeedSubtitles(BaseTestSubtitles): + url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' + IE = ThePlatformFeedIE - def test_multiple_langs(self): + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs + self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '48649a22e82b2da21c9a67a395eedade') -class TestWallaSubtitles(BaseTestSubtitles): - url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' - IE = WallaIE - - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) +class TestRtveSubtitles(BaseTestSubtitles): + url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' + IE = RTVEALaCartaIE def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') + print('Skipping, only available from Spain') + return self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['heb'])) - self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920') + self.assertEqual(set(subtitles.keys()), set(['es'])) + self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') - def test_nosubtitles(self): - self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 
'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one' + +class TestFunnyOrDieSubtitles(BaseTestSubtitles): + url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine' + IE = FunnyOrDieIE + + def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) - + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') -class TestCeskaTelevizeSubtitles(BaseTestSubtitles): - url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' - IE = CeskaTelevizeIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) +class TestDemocracynowSubtitles(BaseTestSubtitles): + url = 'http://www.democracynow.org/shows/2015/7/3' + IE = DemocracynowIE def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['cs'])) - self.assertEqual(md5(subtitles['cs']), 'cc3957b2c6dff1db71e5f2e83d467480') + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') - def test_nosubtitles(self): - self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220' + def test_subtitles_in_page(self): + self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + 
self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') if __name__ == '__main__': diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 9f18055e6..f1e899819 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -34,8 +34,8 @@ def _make_testfunc(testfile): def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') - if ((not os.path.exists(swf_file)) - or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + if ((not os.path.exists(swf_file)) or + os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py index 7f816698e..6c1b7ec91 100644 --- a/test/test_unicode_literals.py +++ b/test/test_unicode_literals.py @@ -17,13 +17,22 @@ IGNORED_FILES = [ 'buildserver.py', ] +IGNORED_DIRS = [ + '.git', + '.tox', +] from test.helper import assertRegexpMatches class TestUnicodeLiterals(unittest.TestCase): def test_all_files(self): - for dirpath, _, filenames in os.walk(rootDir): + for dirpath, dirnames, filenames in os.walk(rootDir): + for ignore_dir in IGNORED_DIRS: + if ignore_dir in dirnames: + # If we remove the directory from dirnames os.walk won't + # recurse into it + dirnames.remove(ignore_dir) for basename in filenames: if not basename.endswith('.py'): continue diff --git a/test/test_utils.py b/test/test_utils.py index 1c29d0889..501355c74 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,9 +21,11 @@ from youtube_dl.utils import ( clean_html, DateRange, detect_exe_version, + determine_ext, encodeFilename, escape_rfc3986, escape_url, + ExtractorError, find_xpath_attr, fix_xml_ampersands, InAdvancePagedList, @@ -38,6 +40,9 @@ from youtube_dl.utils import ( parse_iso8601, read_batch_urls, sanitize_filename, + sanitize_path, + prepend_extension, + replace_extension, shell_quote, 
smuggle_url, str_to_int, @@ -48,12 +53,24 @@ from youtube_dl.utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + lowercase_escape, url_basename, urlencode_postdata, version_tuple, xpath_with_ns, + xpath_element, + xpath_text, + xpath_attr, render_table, match_str, + parse_dfxp_time_expr, + dfxp2srt, + cli_option, + cli_valueless_option, + cli_bool_option, +) +from youtube_dl.compat import ( + compat_etree_fromstring, ) @@ -86,6 +103,11 @@ class TestUtil(unittest.TestCase): sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') + self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') + self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf') + self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf') + forbidden = '"\0\\/' for fc in forbidden: for fbc in forbidden: @@ -126,6 +148,58 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') + def test_sanitize_path(self): + if sys.platform != 'win32': + return + + self.assertEqual(sanitize_path('abc'), 'abc') + self.assertEqual(sanitize_path('abc/def'), 'abc\\def') + self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') + self.assertEqual(sanitize_path('abc|def'), 'abc#def') + self.assertEqual(sanitize_path('<>:"|?*'), '#######') + self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def') + self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def') + + self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc') + self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc') + + self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc') + self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), 
'\\\\?\\C:\\ab#c\\de#f') + self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc') + + self.assertEqual( + sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'), + 'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s') + + self.assertEqual( + sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'), + 'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part') + self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#') + self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def') + self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#') + + self.assertEqual(sanitize_path('../abc'), '..\\abc') + self.assertEqual(sanitize_path('../../abc'), '..\\..\\abc') + self.assertEqual(sanitize_path('./abc'), 'abc') + self.assertEqual(sanitize_path('./../abc'), '..\\abc') + + def test_prepend_extension(self): + self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') + self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') + self.assertEqual(prepend_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') + self.assertEqual(prepend_extension('abc', 'temp'), 'abc.temp') + self.assertEqual(prepend_extension('.abc', 'temp'), '.abc.temp') + self.assertEqual(prepend_extension('.abc.ext', 'temp'), '.abc.temp.ext') + + def test_replace_extension(self): + self.assertEqual(replace_extension('abc.ext', 'temp'), 'abc.temp') + self.assertEqual(replace_extension('abc.ext', 'temp', 'ext'), 'abc.temp') + self.assertEqual(replace_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') + self.assertEqual(replace_extension('abc', 'temp'), 'abc.temp') + self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') + self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + def test_ordered_set(self): self.assertEqual(orderedSet([1, 
1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7]) self.assertEqual(orderedSet([]), []) @@ -135,8 +209,10 @@ class TestUtil(unittest.TestCase): def test_unescape_html(self): self.assertEqual(unescapeHTML('%20;'), '%20;') - self.assertEqual( - unescapeHTML('é'), 'é') + self.assertEqual(unescapeHTML('/'), '/') + self.assertEqual(unescapeHTML('/'), '/') + self.assertEqual(unescapeHTML('é'), 'é') + self.assertEqual(unescapeHTML('�'), '�') def test_daterange(self): _20century = DateRange("19000101", "20000101") @@ -160,6 +236,15 @@ class TestUtil(unittest.TestCase): self.assertEqual( unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') + self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) + + def test_determine_ext(self): + self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') + self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None) + self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) + self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) + self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') def test_find_xpath_attr(self): testxml = '''<root> @@ -167,12 +252,21 @@ class TestUtil(unittest.TestCase): <node x="a"/> <node x="a" y="c" /> <node x="b" y="d" /> + <node x="" /> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) + self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) + self.assertEqual(find_xpath_attr(doc, './/node', 
'x', 'b'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2]) self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4]) def test_xpath_with_ns(self): testxml = '''<root xmlns:media="http://example.com/"> @@ -181,12 +275,56 @@ class TestUtil(unittest.TestCase): <url>http://server.com/download.mp3</url> </media:song> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) self.assertTrue(find('media:song') is not None) self.assertEqual(find('media:song/media:author').text, 'The Author') self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') + def test_xpath_element(self): + doc = xml.etree.ElementTree.Element('root') + div = xml.etree.ElementTree.SubElement(doc, 'div') + p = xml.etree.ElementTree.SubElement(div, 'p') + p.text = 'Foo' + self.assertEqual(xpath_element(doc, 'div/p'), p) + self.assertEqual(xpath_element(doc, ['div/p']), p) + self.assertEqual(xpath_element(doc, ['div/bar', 'div/p']), p) + self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') + self.assertEqual(xpath_element(doc, ['div/bar'], default='default'), 'default') + self.assertTrue(xpath_element(doc, 'div/bar') is None) + self.assertTrue(xpath_element(doc, ['div/bar']) is None) + self.assertTrue(xpath_element(doc, ['div/bar'], 'div/baz') is None) + self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar'], fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar', 'div/baz'], fatal=True) + + def test_xpath_text(self): + testxml = '''<root> + <div> + <p>Foo</p> + </div> + </root>''' + doc = compat_etree_fromstring(testxml) + 
self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') + self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') + self.assertTrue(xpath_text(doc, 'div/bar') is None) + self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) + + def test_xpath_attr(self): + testxml = '''<root> + <div> + <p x="a">Foo</p> + </div> + </root>''' + doc = compat_etree_fromstring(testxml) + self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') + self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) + self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) + self.assertEqual(xpath_attr(doc, 'div/bar', 'x', default='default'), 'default') + self.assertEqual(xpath_attr(doc, 'div/p', 'y', default='default'), 'default') + self.assertRaises(ExtractorError, xpath_attr, doc, 'div/bar', 'x', fatal=True) + self.assertRaises(ExtractorError, xpath_attr, doc, 'div/p', 'y', fatal=True) + def test_smuggle_url(self): data = {"ö": "ö", "abc": [3]} url = 'https://foo.bar/baz?x=y#a' @@ -244,6 +382,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('2.5 hours'), 9000) self.assertEqual(parse_duration('02:03:04'), 7384) self.assertEqual(parse_duration('01:02:03:04'), 93784) + self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) + self.assertEqual(parse_duration('87 Min.'), 5220) def test_fix_xml_ampersands(self): self.assertEqual( @@ -304,6 +444,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) + self.assertEqual(parse_iso8601('2015-09-29T08:27:31.727'), 1443515251) + self.assertEqual(parse_iso8601('2015-09-29T08-27-31.727'), None) def test_strip_jsonp(self): stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);') @@ -318,6 +460,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(uppercase_escape('aä'), 'aä') 
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') + def test_lowercase_escape(self): + self.assertEqual(lowercase_escape('aä'), 'aä') + self.assertEqual(lowercase_escape('\\u0026'), '&') + def test_limit_length(self): self.assertEqual(limit_length(None, 12), None) self.assertEqual(limit_length('foo', 12), 'foo') @@ -370,6 +516,13 @@ class TestUtil(unittest.TestCase): "playlist":[{"controls":{"all":null}}] }''') + inp = '''"The CW\\'s \\'Crazy Ex-Girlfriend\\'"''' + self.assertEqual(js_to_json(inp), '''"The CW's 'Crazy Ex-Girlfriend'"''') + + inp = '"SAND Number: SAND 2013-7800P\\nPresenter: Tom Russo\\nHabanero Software Training - Xyce Software\\nXyce, Sandia\\u0027s"' + json_code = js_to_json(inp) + self.assertEqual(json.loads(json_code), json.loads(inp)) + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) @@ -387,6 +540,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + on = js_to_json('["abc", "def",]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('{"abc": "def",}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') @@ -491,6 +650,102 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') 'like_count > 100 & dislike_count <? 
50 & description', {'like_count': 190, 'dislike_count': 10})) + def test_parse_dfxp_time_expr(self): + self.assertEqual(parse_dfxp_time_expr(None), 0.0) + self.assertEqual(parse_dfxp_time_expr(''), 0.0) + self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1) + self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1) + self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0) + self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1) + + def test_dfxp2srt(self): + dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?> + <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> + <body> + <div xml:lang="en"> + <p begin="0" end="1">The following line contains Chinese characters and special symbols</p> + <p begin="1" end="2">第二行<br/>♪♪</p> + <p begin="2" dur="1"><span>Third<br/>Line</span></p> + </div> + </body> + </tt>''' + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The following line contains Chinese characters and special symbols + +2 +00:00:01,000 --> 00:00:02,000 +第二行 +♪♪ + +3 +00:00:02,000 --> 00:00:03,000 +Third +Line + +''' + self.assertEqual(dfxp2srt(dfxp_data), srt_data) + + dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?> + <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> + <body> + <div xml:lang="en"> + <p begin="0" end="1">The first line</p> + </div> + </body> + </tt>''' + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The first line + +''' + self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) + + def test_cli_option(self): + self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) + self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) + self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + + def test_cli_valueless_option(self): + self.assertEqual(cli_valueless_option( + {'downloader': 'external'}, '--external-downloader', 'downloader', 'external'), ['--external-downloader']) + 
self.assertEqual(cli_valueless_option( + {'downloader': 'internal'}, '--external-downloader', 'downloader', 'external'), []) + self.assertEqual(cli_valueless_option( + {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), ['--no-check-certificate']) + self.assertEqual(cli_valueless_option( + {'nocheckcertificate': False}, '--no-check-certificate', 'nocheckcertificate'), []) + self.assertEqual(cli_valueless_option( + {'checkcertificate': True}, '--no-check-certificate', 'checkcertificate', False), []) + self.assertEqual(cli_valueless_option( + {'checkcertificate': False}, '--no-check-certificate', 'checkcertificate', False), ['--no-check-certificate']) + + def test_cli_bool_option(self): + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), + ['--no-check-certificate', 'true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='='), + ['--no-check-certificate=true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), + ['--check-certificate', 'false']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + ['--check-certificate=false']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), + ['--check-certificate', 'true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + ['--check-certificate=true']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 780636c77..84b8f39e0 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -33,7 +33,7 @@ params = 
get_params({ TEST_ID = 'gr51aVj-mLg' -ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' +ANNOTATIONS_FILE = TEST_ID + '.annotations.xml' EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c889b6f15..26aadb34f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -57,5 +57,14 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) + def test_youtube_flat_playlist_titles(self): + dl = FakeYDL() + dl.params['extract_flat'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertIsPlaylist(result) + for entry in result['entries']: + self.assertTrue(entry.get('title')) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 09696e19a..060864434 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -64,6 +64,12 @@ _TESTS = [ 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' + ), + ( + 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', + 'js', + '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', + '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) ] diff --git a/tox.ini b/tox.ini index ed01e3386..48504329f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,13 @@ [tox] -envlist = py26,py27,py33 +envlist = py26,py27,py33,py34,py35 [testenv] deps = nose coverage -commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html +# We need a valid $HOME for test_compat_expanduser +passenv = HOME +defaultargs = test --exclude test_download.py --exclude 
test_age_restriction.py + --exclude test_subtitles.py --exclude test_write_annotations.py + --exclude test_youtube_lists.py +commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..9a8c7da05 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -4,8 +4,10 @@ from __future__ import absolute_import, unicode_literals import collections +import contextlib import datetime import errno +import fileinput import io import itertools import json @@ -19,6 +21,7 @@ import subprocess import socket import sys import time +import tokenize import traceback if os.name == 'nt': @@ -28,14 +31,16 @@ from .compat import ( compat_basestring, compat_cookiejar, compat_expanduser, + compat_get_terminal_size, compat_http_client, compat_kwargs, compat_str, + compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( - escape_url, ContentTooShortError, date_from_str, DateRange, @@ -46,28 +51,31 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, - get_term_width, locked_file, make_HTTPS_handler, MaxDownloadsReached, PagedList, parse_filesize, + PerRequestProxyHandler, PostProcessingError, platform_name, preferredencoding, render_table, SameFileError, sanitize_filename, + sanitize_path, + sanitized_Request, std_headers, subtitles_filename, - takewhile_inclusive, UnavailableVideoError, url_basename, version_tuple, write_json_file, write_string, + YoutubeDLCookieProcessor, YoutubeDLHandler, prepend_extension, + replace_extension, args_to_str, age_restricted, ) @@ -114,7 +122,7 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. - videopassword: Password for acces a video. + videopassword: Password for accessing a video. 
usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. @@ -131,10 +139,10 @@ class YoutubeDL(object): (or video) as a single JSON line. simulate: Do not download the video files. format: Video format code. See options.py for more information. - format_limit: Highest quality format to try. outtmpl: Template for output names. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. + force_generic_extractor: Force downloader to use the generic extractor nooverwrites: Prevent overwriting files. playliststart: Playlist item to start at. playlistend: Playlist item to end at. @@ -150,11 +158,11 @@ class YoutubeDL(object): writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files writesubtitles: Write the video subtitles to a file - writeautomaticsub: Write the automatic subtitles to a file + writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@ -181,6 +189,8 @@ class YoutubeDL(object): prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use + cn_verification_proxy: URL of the proxy to use for IP address verification + on Chinese sites. 
(Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -199,18 +209,25 @@ class YoutubeDL(object): postprocessor. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries - * status: One of "downloading" and "finished". + * status: One of "downloading", "error", or "finished". Check this first and ignore unknown values. - If status is one of "downloading" or "finished", the + If status is one of "downloading", or "finished", the following properties may also be present: * filename: The final filename (always present) + * tmpfilename: The filename we're currently writing to * downloaded_bytes: Bytes on disk * total_bytes: Size of the whole file, None if unknown - * tmpfilename: The filename we're currently writing to + * total_bytes_estimate: Guess of the eventual file size, + None if unavailable. + * elapsed: The number of seconds since download started. * eta: The estimated time in seconds, None if unknown * speed: The download speed in bytes/second, None if unknown + * fragment_index: The counter of the currently + downloaded video fragment. + * fragment_count: The number of fragments (= individual + files that will be merged) Progress hooks are guaranteed to be called at least once (with status "finished") if the download is successful. @@ -225,7 +242,6 @@ class YoutubeDL(object): call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download. - external_downloader: Executable of the external downloader to call. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of @@ -235,17 +251,22 @@ class YoutubeDL(object): match_filter_func in utils.py is one example for this. 
no_color: Do not emit color codes in output. + The following options determine which downloader is picked: + external_downloader: Executable of the external downloader to call. + None or unset for standard (built-in) downloader. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. The following parameters are not used by YoutubeDL itself, they are used by - the FileDownloader: + the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize. + xattr_set_filesize, external_downloader_args. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. - exec_cmd: Arbitrary command to run after downloading + postprocessor_args: A list of additional command-line arguments for the + postprocessor. """ params = None @@ -267,14 +288,18 @@ class YoutubeDL(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self.params = params + self.params = { + # Default parameters + 'nocheckcertificate': False, + } + self.params.update(params) self.cache = Cache(self) if params.get('bidi_workaround', False): try: import pty master, slave = pty.openpty() - width = get_term_width() + width = compat_get_terminal_size().columns if width is None: width_args = [] else: @@ -298,8 +323,8 @@ class YoutubeDL(object): raise if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system 
encoding ' @@ -307,8 +332,10 @@ class YoutubeDL(object): 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True - if '%(stitle)s' in self.params.get('outtmpl', ''): - self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') + if isinstance(params.get('outtmpl'), bytes): + self.report_warning( + 'Parameter outtmpl is bytes, but should be a unicode string. ' + 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') self._setup_opener() @@ -555,7 +582,7 @@ class YoutubeDL(object): # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = encodeFilename(filename, True).decode(preferredencoding()) - return filename + return sanitize_path(filename) except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None @@ -609,12 +636,15 @@ class YoutubeDL(object): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True): + process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. 
extra_info is a dict containing the extra values to add to each result - ''' + ''' + + if not ie_key and force_generic_extractor: + ie_key = 'Generic' if ie_key: ies = [self.get_info_extractor(ie_key)] @@ -743,7 +773,9 @@ class YoutubeDL(object): if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: - entries = [ie_entries[i - 1] for i in playlistitems] + entries = [ + ie_entries[i - 1] for i in playlistitems + if -n_all_entries <= i - 1 < n_all_entries] else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) @@ -803,6 +835,7 @@ class YoutubeDL(object): extra_info=extra) playlist_results.append(entry_result) ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result elif result_type == 'compat_list': self.report_warning( @@ -828,8 +861,8 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) - def _apply_format_filter(self, format_spec, available_formats): - " Returns a tuple of the remaining format_spec and filtered formats " + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, @@ -839,13 +872,13 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - operator_rex = re.compile(r'''(?x)\s*\[ + operator_rex = re.compile(r'''(?x)\s* (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) 
- \]$ + $ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(format_spec) + m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) @@ -856,7 +889,7 @@ class YoutubeDL(object): if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( - m.group('value'), format_spec)) + m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: @@ -864,78 +897,283 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - str_operator_rex = re.compile(r'''(?x)\s*\[ + str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9_-]+) - \s*\]$ + \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(format_spec) + m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: - raise ValueError('Invalid format specification %r' % format_spec) + raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) - new_formats = [f for f in available_formats if _filter(f)] + return _filter + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + GROUP = 'GROUP' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _remove_unused_ops(tokens): + # Remove 
operators that we don't use and join them with the surrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', 
start) + selectors.append(current_selector) + current_selector = None + elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): 
+ formats = list(formats) + if not formats: + return + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + 
else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) - new_format_spec = format_spec[:-len(m.group(0))] - if not new_format_spec: - new_format_spec = 'best' + def selector_function(formats): + formats = list(formats) + for pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) - return (new_format_spec, new_formats) + filters = [self._build_format_filter(f) for f in selector.filters] - def select_format(self, format_spec, available_formats): - while format_spec.endswith(']'): - format_spec, available_formats = self._apply_format_filter( - format_spec, available_formats) - if not available_formats: - return None + def final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector - if format_spec == 'best' or format_spec is None: - return available_formats[-1] - elif format_spec == 'worst': - return available_formats[0] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - 
return audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, available_formats)) - if matches: - return matches[-1] - return None + stream = io.BytesIO(format_spec.encode('utf-8')) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() @@ -951,30 +1189,9 @@ class YoutubeDL(object): return res def _calc_cookies(self, info_dict): - class _PseudoRequest(object): - def __init__(self, url): - self.url = url - self.headers = {} - self.unverifiable = False - - def add_unredirected_header(self, k, v): - self.headers[k] = v - - def get_full_url(self): - return self.url - - def is_unverifiable(self): - return self.unverifiable - - def has_header(self, h): - return h in self.headers - - def get_header(self, h, default=None): - 
return self.headers.get(h, default) - - pr = _PseudoRequest(info_dict['url']) + pr = sanitized_Request(info_dict['url']) self.cookiejar.add_cookie_header(pr) - return pr.headers.get('Cookie') + return pr.get_header('Cookie') def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' @@ -999,7 +1216,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): - if 'width' in t and 'height' in t: + if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i @@ -1011,19 +1228,29 @@ class YoutubeDL(object): info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around negative timestamps in Windows - # (see http://bugs.python.org/issue1646728) - if info_dict['timestamp'] < 0 and os.name == 'nt': - info_dict['timestamp'] = 0 - upload_date = datetime.datetime.utcfromtimestamp( - info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - - # This extractors handle format selection themselves - if info_dict['extractor'] in ['Youku']: - if download: - self.process_info(info_dict) - return info_dict + # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass + + subtitles = info_dict.get('subtitles') + if subtitles: + for _, subtitle in subtitles.items(): + for subtitle_format in subtitle: + if 'ext' not in subtitle_format: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + if self.params.get('listsubtitles', False): + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') + return + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], subtitles, + info_dict.get('automatic_captions')) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: @@ -1035,6 +1262,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1042,6 +1271,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], @@ -1057,12 +1298,6 @@ class YoutubeDL(object): full_format_info.update(format) format['http_headers'] 
= self._calc_headers(full_format_info) - format_limit = self.params.get('format_limit', None) - if format_limit: - formats = list(takewhile_inclusive( - lambda f: f['format_id'] != format_limit, formats - )) - # TODO Central sorting goes here if formats[0] is not info_dict: @@ -1080,58 +1315,17 @@ class YoutubeDL(object): req_format = self.params.get('format') if req_format is None: - req_format = 'best' - formats_to_download = [] - # The -1 is for supporting YoutubeIE - if req_format in ('-1', 'all'): - formats_to_download = formats - else: - for rfstr in req_format.split(','): - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = rfstr.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - selected_format = { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': 
formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download.append(selected_format) - break + req_format_list = [] + if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and + info_dict['extractor'] in ['youtube', 'ted'] and + not info_dict.get('is_live')): + merger = FFmpegMergerPP(self) + if merger.available and merger.can_merge(): + req_format_list.append('bestvideo+bestaudio') + req_format_list.append('best') + req_format = '/'.join(req_format_list) + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) @@ -1147,6 +1341,55 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs 
= {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1161,9 +1404,6 @@ class YoutubeDL(object): if len(info_dict['title']) > 200: info_dict['title'] = info_dict['title'][:197] + '...' - # Keep for backwards compatibility - info_dict['stitle'] = info_dict['title'] - if 'format' not in info_dict: info_dict['format'] = info_dict['ext'] @@ -1209,7 +1449,7 @@ class YoutubeDL(object): return try: - dn = os.path.dirname(encodeFilename(filename)) + dn = os.path.dirname(sanitize_path(encodeFilename(filename))) if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: @@ -1217,7 +1457,7 @@ class YoutubeDL(object): return if self.params.get('writedescription', False): - descfn = filename + '.description' + descfn = replace_extension(filename, 'description', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): self.to_screen('[info] Video description is already present') elif info_dict.get('description') is None: @@ -1232,7 +1472,7 @@ class YoutubeDL(object): return if self.params.get('writeannotations', False): - annofn = filename + '.annotations.xml' + annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): self.to_screen('[info] Video annotations are already present') else: @@ -1249,15 +1489,23 @@ class 
YoutubeDL(object): subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + subtitles = info_dict['requested_subtitles'] + ie = self.get_info_extractor(info_dict['extractor_key']) + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + sub_data = ie._download_webpage( + sub_info['url'], info_dict['id'], note=False) + except ExtractorError as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err.cause))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,19 +1513,19 @@ class YoutubeDL(object): else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return if self.params.get('writeinfojson', False): - infofn = os.path.splitext(filename)[0] + '.info.json' + infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): self.to_screen('[info] Video description metadata is already present') else: self.to_screen('[info] Writing video 
description metadata as JSON to: ' + infofn) try: - write_json_file(info_dict, infofn) + write_json_file(self.filter_requested_info(info_dict), infofn) except (OSError, IOError): self.report_error('Cannot write metadata to JSON file ' + infofn) return @@ -1297,24 +1545,57 @@ class YoutubeDL(object): if info_dict.get('requested_formats') is not None: downloaded = [] success = True - merger = FFmpegMergerPP(self, not self.params.get('keepvideo')) - if not merger._executable: + merger = FFmpegMergerPP(self) + if not merger.available: postprocessors = [] self.report_warning('You have requested multiple ' 'formats but ffmpeg or avconv are not installed.' - ' The formats won\'t be merged') + ' The formats won\'t be merged.') else: postprocessors = [merger] - for f in info_dict['requested_formats']: - new_info = dict(info_dict) - new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id']) - downloaded.append(fname) - partial_success = dl(fname, new_info) - success = success and partial_success - info_dict['__postprocessors'] = postprocessors - info_dict['__files_to_merge'] = downloaded + + def compatible_formats(formats): + video, audio = formats + # Check extension + video_ext, audio_ext = audio.get('ext'), video.get('ext') + if video_ext and audio_ext: + COMPATIBLE_EXTS = ( + ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'), + ('webm') + ) + for exts in COMPATIBLE_EXTS: + if video_ext in exts and audio_ext in exts: + return True + # TODO: Check acodec/vcodec + return False + + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext == info_dict['ext'] + else filename) + requested_formats = info_dict['requested_formats'] + if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): + info_dict['ext'] = 'mkv' + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into 
mkv.') + # Ensure filename always has a correct extension for successful merge + filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) + if os.path.exists(encodeFilename(filename)): + self.to_screen( + '[download] %s has already been downloaded and ' + 'merged' % filename) + else: + for f in requested_formats: + new_info = dict(info_dict) + new_info.update(f) + fname = self.prepare_filename(new_info) + fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + downloaded.append(fname) + partial_success = dl(fname, new_info) + success = success and partial_success + info_dict['__postprocessors'] = postprocessors + info_dict['__files_to_merge'] = downloaded else: # Just a single file success = dl(filename, info_dict) @@ -1377,14 +1658,15 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in outtmpl - and self.params.get('max_downloads') != 1): + '%' not in outtmpl and + self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: try: # It also downloads the videos - res = self.extract_info(url) + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: @@ -1397,8 +1679,11 @@ class YoutubeDL(object): return self._download_retcode def download_with_info_file(self, info_filename): - with io.open(info_filename, 'r', encoding='utf-8') as f: - info = json.load(f) + with contextlib.closing(fileinput.FileInput( + [info_filename], mode='r', + openhook=fileinput.hook_encoded('utf-8'))) as f: + # FileInput doesn't have a read method, we can't call json.load + info = self.filter_requested_info(json.loads('\n'.join(f))) try: self.process_ie_result(info, download=True) except DownloadError: @@ -1410,6 +1695,12 @@ class YoutubeDL(object): raise return self._download_retcode + 
@staticmethod + def filter_requested_info(info_dict): + return dict( + (k, v) for k, v in info_dict.items() + if k not in ['requested_formats', 'requested_subtitles']) + def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" info = dict(ie_info) @@ -1419,24 +1710,18 @@ class YoutubeDL(object): pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: - keep_video = None - old_filename = info['filepath'] + files_to_delete = [] try: - keep_video_wish, info = pp.run(info) - if keep_video_wish is not None: - if keep_video_wish: - keep_video = keep_video_wish - elif keep_video is None: - # No clear decision yet, let IE decide - keep_video = keep_video_wish + files_to_delete, info = pp.run(info) except PostProcessingError as e: self.report_error(e.msg) - if keep_video is False and not self.params.get('keepvideo', False): - try: + if files_to_delete and not self.params.get('keepvideo', False): + for old_filename in files_to_delete: self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded video file') + try: + os.remove(encodeFilename(old_filename)) + except (IOError, OSError): + self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): # Future-proof against any change in case @@ -1545,29 +1830,18 @@ class YoutubeDL(object): return res def list_formats(self, info_dict): - def line(format, idlen=20): - return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % ( - format['format_id'], - format['ext'], - self.format_resolution(format), - self._format_note(format), - )) - formats = info_dict.get('formats', [info_dict]) - idlen = max(len('format code'), - max(len(f['format_id']) for f in formats)) - formats_s = [ - line(f, idlen) for f in formats + table = [ + [f['format_id'], f['ext'], self.format_resolution(f), 
self._format_note(f)] + for f in formats if f.get('preference') is None or f['preference'] >= -1000] if len(formats) > 1: - formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)' + table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' - header_line = line({ - 'format_id': 'format code', 'ext': 'extension', - 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen) + header_line = ['format code', 'extension', 'resolution', 'note'] self.to_screen( - '[info] Available formats for %s:\n%s\n%s' % - (info_dict['id'], header_line, '\n'.join(formats_s))) + '[info] Available formats for %s:\n%s' % + (info_dict['id'], render_table(header_line, table))) def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') @@ -1586,28 +1860,21 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles, name='subtitles'): + if not subtitles: + self.to_screen('%s has no %s' % (video_id, name)) + return + self.to_screen( + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) + def urlopen(self, req): """ Start an HTTP download """ - - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - req_is_string = isinstance(req, compat_basestring) - url = req if req_is_string else req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - 
if url != url_escaped: - if req_is_string: - req = url_escaped - else: - req = compat_urllib_request.Request( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - + if isinstance(req, compat_basestring): + req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): @@ -1639,15 +1906,15 @@ class YoutubeDL(object): out = out.decode().strip() if re.match('[0-9a-f]+', out): self._write_string('[debug] Git HEAD: ' + out + '\n') - except: + except Exception: try: sys.exc_clear() - except: + except Exception: pass self._write_string('[debug] Python version %s - %s\n' % ( platform.python_version(), platform_name())) - exe_versions = FFmpegPostProcessor.get_versions() + exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() exe_str = ', '.join( '%s %s' % (exe, v) @@ -1690,8 +1957,7 @@ class YoutubeDL(object): if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load() - cookie_processor = compat_urllib_request.HTTPCookieProcessor( - self.cookiejar) + cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': proxies = {} @@ -1702,13 +1968,15 @@ class YoutubeDL(object): # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) + proxy_handler = PerRequestProxyHandler(proxies) debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - https_handler, proxy_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, 
data_handler) + # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play # (See https://github.com/rg3/youtube-dl/issues/1309 for details) @@ -1749,7 +2017,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % @@ -1759,7 +2027,7 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id)) try: uf = self.urlopen(t['url']) - with open(thumb_filename, 'wb') as thumbf: + with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ed22f169f..9f131f5db 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -19,6 +19,7 @@ from .compat import ( compat_expanduser, compat_getpass, compat_print, + compat_shlex_split, workaround_optparse_bug9161, ) from .utils import ( @@ -168,8 +169,11 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error('invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') + if opts.convertsubtitles is not None: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + parser.error('invalid subtitle format specified') if opts.date is 
not None: date = DateRange.day(opts.date) @@ -185,18 +189,14 @@ def _real_main(argv=None): if opts.allsubtitles and not opts.writeautomaticsub: opts.writesubtitles = True - if sys.version_info < (3,): - # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) - if opts.outtmpl is not None: - opts.outtmpl = opts.outtmpl.decode(preferredencoding()) - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' @@ -209,6 +209,11 @@ def _real_main(argv=None): # PostProcessors postprocessors = [] # Add the metadata pp first, the other pps will copy it + if opts.metafromtitle: + postprocessors.append({ + 'key': 'MetadataFromTitle', + 'titleformat': opts.metafromtitle + }) if opts.addmetadata: postprocessors.append({'key': 'FFmpegMetadata'}) if opts.extractaudio: @@ -223,23 +228,30 @@ def _real_main(argv=None): 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) + if opts.convertsubtitles: + postprocessors.append({ + 'key': 'FFmpegSubtitlesConvertor', + 'format': opts.convertsubtitles, + }) if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) if opts.embedthumbnail: - if not opts.addmetadata: - postprocessors.append({'key': 'FFmpegAudioFix'}) - postprocessors.append({'key': 'AtomicParsley'}) + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
if opts.exec_cmd: postprocessors.append({ 'key': 'ExecAfterDownload', - 'verboseOutput': opts.verbose, 'exec_cmd': opts.exec_cmd, }) if opts.xattr_set_filesize: @@ -248,6 +260,12 @@ def _real_main(argv=None): xattr # Confuse flake8 except ImportError: parser.error('setting filesize xattr requested but python-xattr is not available') + external_downloader_args = None + if opts.external_downloader_args: + external_downloader_args = compat_shlex_split(opts.external_downloader_args) + postprocessor_args = None + if opts.postprocessor_args: + postprocessor_args = compat_shlex_split(opts.postprocessor_args) match_filter = ( None if opts.match_filter is None else match_filter_func(opts.match_filter)) @@ -273,12 +291,12 @@ def _real_main(argv=None): 'simulate': opts.simulate or any_getting, 'skip_download': opts.skip_download, 'format': opts.format, - 'format_limit': opts.format_limit, 'listformats': opts.listformats, 'outtmpl': outtmpl, 'autonumber_size': opts.autonumber_size, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, 'retries': opts_retries, @@ -336,7 +354,6 @@ def _real_main(argv=None): 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, - 'exec_cmd': opts.exec_cmd, 'extract_flat': opts.extract_flat, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, @@ -350,12 +367,17 @@ def _real_main(argv=None): 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': match_filter, 'no_color': opts.no_color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, + 'external_downloader_args': external_downloader_args, + 'postprocessor_args': postprocessor_args, + 'cn_verification_proxy': opts.cn_verification_proxy, } with YoutubeDL(ydl_opts) as ydl: # Update version if 
opts.update_self: - update_self(ydl.to_screen, opts.verbose) + update_self(ydl.to_screen, opts.verbose, ydl._opener) # Remove cache dir if opts.rm_cachedir: diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py index 65a0f891c..42a0f8c6f 100755 --- a/youtube_dl/__main__.py +++ b/youtube_dl/__main__.py @@ -11,7 +11,7 @@ if __package__ is None and not hasattr(sys, "frozen"): # direct call of __main__.py import os.path path = os.path.realpath(os.path.abspath(__file__)) - sys.path.append(os.path.dirname(os.path.dirname(path))) + sys.path.insert(0, os.path.dirname(os.path.dirname(path))) import youtube_dl diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e989cdbbd..a3e85264a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,12 +1,20 @@ from __future__ import unicode_literals +import binascii +import collections +import email import getpass +import io import optparse import os import re +import shlex +import shutil import socket import subprocess import sys +import itertools +import xml.etree.ElementTree try: @@ -34,20 +42,25 @@ try: except ImportError: # Python 2 import urlparse as compat_urlparse +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 import cookielib as compat_cookiejar try: - import html.entities as compat_html_entities + import http.cookies as compat_cookies except ImportError: # Python 2 - 
import htmlentitydefs as compat_html_entities + import Cookie as compat_cookies try: - import html.parser as compat_html_parser + import html.entities as compat_html_entities except ImportError: # Python 2 - import HTMLParser as compat_html_parser + import htmlentitydefs as compat_html_entities try: import http.client as compat_http_client @@ -77,47 +90,113 @@ except ImportError: import BaseHTTPServer as compat_http_server try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': + from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus +except ImportError: # Python 2 + _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') + else re.compile('([\x00-\x7f]+)')) + + # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus + # implementations from cpython 3.4.3's stdlib. Python 2's version + # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) + + def compat_urllib_parse_unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? 
+ string.split + return b'' + if isinstance(string, compat_str): + string = string.encode('utf-8') + bits = string.split(b'%') + if len(bits) == 1: return string - res = string.split('%') - if len(res) == 1: + res = [bits[0]] + append = res.append + for item in bits[1:]: + try: + append(compat_urllib_parse._hextochr[item[:2]]) + append(item[2:]) + except KeyError: + append(b'%') + append(item) + return b''.join(res) + + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + if '%' not in string: + string.split return string if encoding is None: encoding = 'utf-8' if errors is None: errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. 
- string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string + bits = _asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) + return ''.join(res) + + def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return compat_urllib_parse_unquote(string, encoding, errors) try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. 
+ # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(":", 1) + mediatype, data = data.split(",", 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string( + "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) try: compat_basestring = basestring # Python 2 @@ -134,6 +213,43 @@ try: except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: + compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: + # python 2.x tries to encode unicode strings with ascii (see the + # XMLParser._fixtext method) + etree = xml.etree.ElementTree + + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=etree.TreeBuilder()) + parser.feed(text) + return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text 
is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs @@ -192,6 +308,17 @@ except ImportError: # Python < 3.3 return "'" + s.replace("'", "'\"'\"'") + "'" +if sys.version_info >= (2, 7, 3): + compat_shlex_split = shlex.split +else: + # Working around shlex issue with unicode strings on some python 2 + # versions (see http://bugs.python.org/issue1548891) + def compat_shlex_split(s, comments=False, posix=True): + if isinstance(s, compat_str): + s = s.encode('utf-8') + return shlex.split(s, comments, posix) + + def compat_ord(c): if type(c) is int: return c @@ -364,31 +491,87 @@ def workaround_optparse_bug9161(): return real_add_option(self, *bargs, **bkwargs) optparse.OptionGroup.add_option = _compat_add_option +if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 + compat_get_terminal_size = shutil.get_terminal_size +else: + _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) + + def compat_get_terminal_size(fallback=(80, 24)): + columns = compat_getenv('COLUMNS') + if columns: + columns = int(columns) + else: + columns = None + lines = compat_getenv('LINES') + if lines: + lines = int(lines) + else: + lines = None + + if columns is None or lines is None or columns <= 0 or lines <= 0: + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + _lines, _columns = map(int, out.split()) + except Exception: + _columns, _lines = _terminal_size(*fallback) + + if columns is None or columns <= 0: + columns = _columns + if lines is None or lines <= 0: + lines = _lines + return _terminal_size(columns, lines) + +try: + itertools.count(start=0, step=1) + compat_itertools_count = itertools.count +except TypeError: # Python 2.6 + def compat_itertools_count(start=0, step=1): + n = start + while True: + yield n + n += step + +if sys.version_info >= (3, 0): + from tokenize import 
tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ 'compat_HTTPError', 'compat_basestring', 'compat_chr', 'compat_cookiejar', + 'compat_cookies', + 'compat_etree_fromstring', 'compat_expanduser', + 'compat_get_terminal_size', 'compat_getenv', 'compat_getpass', 'compat_html_entities', - 'compat_html_parser', 'compat_http_client', 'compat_http_server', + 'compat_itertools_count', 'compat_kwargs', 'compat_ord', 'compat_parse_qs', 'compat_print', + 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index eff1122c5..dccc59212 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -6,8 +6,9 @@ from .f4m import F4mFD from .hls import HlsFD from .hls import NativeHlsFD from .http import HttpFD -from .mplayer import MplayerFD +from .rtsp import RtspFD from .rtmp import RtmpFD +from .dash import DashSegmentsFD from ..utils import ( determine_protocol, @@ -17,9 +18,10 @@ PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': NativeHlsFD, 'm3u8': HlsFD, - 'mms': MplayerFD, - 'rtsp': MplayerFD, + 'mms': RtspFD, + 'rtsp': RtspFD, 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, } @@ -34,6 +36,9 @@ def get_suitable_downloader(info_dict, params={}): if ed.supports(info_dict): return ed + if protocol == 'm3u8' and params.get('hls_prefer_native'): + return NativeHlsFD + return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/common.py 
b/youtube_dl/downloader/common.py index 7bb3a948d..b8bf8daf8 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import division, unicode_literals import os import re @@ -8,6 +8,7 @@ import time from ..compat import compat_str from ..utils import ( encodeFilename, + decodeArgument, format_bytes, timeconvert, ) @@ -41,7 +42,9 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimenatal) + (experimental) + external_downloader_args: A list of additional command-line arguments for the + external downloader. Subclasses of this one must re-define the real_download method. """ @@ -54,6 +57,7 @@ class FileDownloader(object): self.ydl = ydl self._progress_hooks = [] self.params = params + self.add_progress_hook(self.report_progress) @staticmethod def format_seconds(seconds): @@ -201,7 +205,7 @@ class FileDownloader(object): return try: os.utime(filename, (time.time(), filetime)) - except: + except Exception: pass return filetime @@ -226,42 +230,64 @@ class FileDownloader(object): self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) self.to_console_title('youtube-dl ' + msg) - def report_progress(self, percent, data_len_str, speed, eta): - """Report download progress.""" - if self.params.get('noprogress', False): + def report_progress(self, s): + if s['status'] == 'finished': + if self.params.get('noprogress', False): + self.to_screen('[download] Download completed') + else: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + if s.get('elapsed') is not None: + s['_elapsed_str'] = self.format_seconds(s['elapsed']) + msg_template = '100%% of %(_total_bytes_str)s in %(_elapsed_str)s' + else: + msg_template = '100%% of %(_total_bytes_str)s' + self._report_progress_status( + msg_template % s, 
is_last_line=True) + + if self.params.get('noprogress'): return - if eta is not None: - eta_str = self.format_eta(eta) - else: - eta_str = 'Unknown ETA' - if percent is not None: - percent_str = self.format_percent(percent) + + if s['status'] != 'downloading': + return + + if s.get('eta') is not None: + s['_eta_str'] = self.format_eta(s['eta']) else: - percent_str = 'Unknown %' - speed_str = self.format_speed(speed) + s['_eta_str'] = 'Unknown ETA' - msg = ('%s of %s at %s ETA %s' % - (percent_str, data_len_str, speed_str, eta_str)) - self._report_progress_status(msg) + if s.get('total_bytes') and s.get('downloaded_bytes') is not None: + s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) + elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: + s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) + else: + if s.get('downloaded_bytes') == 0: + s['_percent_str'] = self.format_percent(0) + else: + s['_percent_str'] = 'Unknown %' - def report_progress_live_stream(self, downloaded_data_len, speed, elapsed): - if self.params.get('noprogress', False): - return - downloaded_str = format_bytes(downloaded_data_len) - speed_str = self.format_speed(speed) - elapsed_str = FileDownloader.format_seconds(elapsed) - msg = '%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str) - self._report_progress_status(msg) - - def report_finish(self, data_len_str, tot_time): - """Report download finished.""" - if self.params.get('noprogress', False): - self.to_screen('[download] Download completed') + if s.get('speed') is not None: + s['_speed_str'] = self.format_speed(s['speed']) + else: + s['_speed_str'] = 'Unknown speed' + + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' + elif s.get('total_bytes_estimate') is not None: + 
s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) + msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' else: - self._report_progress_status( - ('100%% of %s in %s' % - (data_len_str, self.format_seconds(tot_time))), - is_last_line=True) + if s.get('downloaded_bytes') is not None: + s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) + if s.get('elapsed'): + s['_elapsed_str'] = self.format_seconds(s['elapsed']) + msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' + else: + msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' + else: + msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' + + self._report_progress_status(msg_template % s) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -288,18 +314,18 @@ class FileDownloader(object): """ nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) - and os.path.exists(encodeFilename(filename)) + self.params.get('nooverwrites', False) and + os.path.exists(encodeFilename(filename)) ) continuedl_and_exists = ( - self.params.get('continuedl', False) - and os.path.isfile(encodeFilename(filename)) - and not self.params.get('nopart', False) + self.params.get('continuedl', True) and + os.path.isfile(encodeFilename(filename)) and + not self.params.get('nopart', False) ) # Check file already present - if filename != '-' and nooverwrites_and_exists or continuedl_and_exists: + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): self.report_file_already_downloaded(filename) self._hook_progress({ 'filename': filename, @@ -328,19 +354,15 @@ class FileDownloader(object): # this interface self._progress_hooks.append(ph) - def _debug_cmd(self, args, subprocess_encoding, exe=None): + def _debug_cmd(self, args, exe=None): if not self.params.get('verbose', False): return + str_args = [decodeArgument(a) for a in args] + if 
exe is None: - exe = os.path.basename(args[0]) + exe = os.path.basename(str_args[0]) - if subprocess_encoding: - str_args = [ - a.decode(subprocess_encoding) if isinstance(a, bytes) else a - for a in args] - else: - str_args = args try: import pipes shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 000000000..535f2a7fc --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import FileDownloader +from ..utils import sanitized_Request + + +class DashSegmentsFD(FileDownloader): + """ + Download segments in a DASH manifest + """ + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + base_url = info_dict['url'] + segment_urls = info_dict['segment_urls'] + + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None + byte_counter = 0 + + def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): + self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) + req = sanitized_Request(target_url) + if remaining_bytes is not None: + req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + + data = self.ydl.urlopen(req).read() + + if remaining_bytes is not None: + data = data[:remaining_bytes] + + outf.write(data) + return len(data) + + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) + + with open(tmpfilename, 'wb') as outf: + append_url_to_file( + outf, combine_url(base_url, info_dict['initialization_url']), + 'initialization segment') + for i, segment_url in enumerate(segment_urls): + segment_len = append_url_to_file( + outf, combine_url(base_url, segment_url), + 'segment %d / %d' % 
(i + 1, len(segment_urls)), + remaining_bytes) + byte_counter += segment_len + if remaining_bytes is not None: + remaining_bytes -= segment_len + if remaining_bytes <= 0: + break + + self.try_rename(tmpfilename, filename) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index ff031d2e0..2bc011266 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals import os.path import subprocess -import sys from .common import FileDownloader from ..utils import ( + cli_option, + cli_valueless_option, + cli_bool_option, + cli_configuration_args, encodeFilename, + encodeArgument, ) @@ -45,25 +49,23 @@ class ExternalFD(FileDownloader): def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') - def _source_address(self, command_option): - source_address = self.params.get('source_address') - if source_address is None: - return [] - return [command_option, source_address] + def _option(self, command_option, param): + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) + + def _configuration_args(self, default=[]): + return cli_configuration_args(self.params, 'external_downloader_args', default) def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ - cmd = self._make_cmd(tmpfilename, info_dict) - - if sys.platform == 'win32' and sys.version_info < (3, 0): - # 
Windows subprocess module does not actually support Unicode - # on Python 2.x - # See http://stackoverflow.com/a/9951851/35070 - subprocess_encoding = sys.getfilesystemencoding() - cmd = [a.encode(subprocess_encoding, 'ignore') for a in cmd] - else: - subprocess_encoding = None - self._debug_cmd(cmd, subprocess_encoding) + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) p = subprocess.Popen( cmd, stderr=subprocess.PIPE) @@ -75,10 +77,23 @@ class ExternalFD(FileDownloader): class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-o', tmpfilename] + cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class AxelFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] + cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -88,26 +103,39 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--bind-address') + cmd += self._option('--bind-address', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') + cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd class Aria2cFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): - cmd = [ - self.exe, '-c', - '--min-split-size', '1M', 
'--max-connection-per-server', '4'] + cmd = [self.exe, '-c'] + cmd += self._configuration_args([ + '--min-split-size', '1M', '--max-connection-per-server', '4']) dn = os.path.dirname(tmpfilename) if dn: cmd += ['--dir', dn] cmd += ['--out', os.path.basename(tmpfilename)] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += ['--', info_dict['url']] return cmd + +class HttpieFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] + return cmd + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() @@ -122,5 +150,6 @@ def list_external_downloaders(): def get_external_downloader(external_downloader): """ Given the name of the executable, see whether we support the given downloader . 
""" - bn = os.path.basename(external_downloader) + # Drop .exe extension on Windows + bn = os.path.splitext(os.path.basename(external_downloader))[0] return _BY_NAME[bn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 0e7a1c200..6170cc155 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -1,23 +1,23 @@ -from __future__ import unicode_literals +from __future__ import division, unicode_literals import base64 import io import itertools import os import time -import xml.etree.ElementTree as etree -from .common import FileDownloader -from .http import HttpFD +from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, + compat_urllib_error, + compat_urllib_parse_urlparse, ) from ..utils import ( - struct_pack, - struct_unpack, - format_bytes, encodeFilename, sanitize_open, + struct_pack, + struct_unpack, xpath_text, ) @@ -122,7 +122,8 @@ class FlvReader(io.BytesIO): self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved - self.read(1) + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 # time scale self.read_unsigned_int() # CurrentMediaTime @@ -161,6 +162,7 @@ class FlvReader(io.BytesIO): return { 'segments': segments, 'fragments': fragments, + 'live': live, } def read_bootstrap_info(self): @@ -183,6 +185,10 @@ def build_fragments_list(boot_info): for segment, fragments_count in segment_run_table['segment_run']: for _ in range(fragments_count): res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + return res @@ -220,16 +226,13 @@ def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class F4mFD(FileDownloader): +class F4mFD(FragmentFD): """ A downloader for f4m manifests or AdobeHDS. 
""" + FD_NAME = 'f4m' + def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -247,24 +250,47 @@ class F4mFD(FileDownloader): self.report_error('Unsupported DRM') return media + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + if node.text is None: + bootstrap_url = compat_urlparse.urljoin( + base_url, node.attrib['url']) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = base64.b64decode(node.text.encode('ascii')) + boot_info = read_bootstrap_info(bootstrap) + return (boot_info, bootstrap_url) + def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') - self.to_screen('[download] Downloading f4m manifest') - manifest = self.ydl.urlopen(man_url).read() - self.report_destination(filename) - http_dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit', None), - 'test': self.params.get('test', False), - } - ) - - doc = etree.fromstring(manifest) + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + urlh = self.ydl.urlopen(man_url) + man_url = urlh.geturl() + manifest = urlh.read() + + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', 
-1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: @@ -277,18 +303,13 @@ class F4mFD(FileDownloader): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - if bootstrap_node.text is None: - bootstrap_url = compat_urlparse.urljoin( - base_url, bootstrap_node.attrib['url']) - bootstrap = self.ydl.urlopen(bootstrap_url).read() - else: - bootstrap = base64.b64decode(bootstrap_node.text) + boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = base64.b64decode(metadata_node.text) + metadata = base64.b64decode(metadata_node.text.encode('ascii')) else: metadata = None - boot_info = read_bootstrap_info(bootstrap) fragments_list = build_fragments_list(boot_info) if self.params.get('test', False): @@ -298,74 +319,73 @@ class F4mFD(FileDownloader): # For some akamai manifests we'll need to add a query to the fragment url akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - tmpfilename = self.temp_name(filename) - (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') + ctx = { + 'filename': filename, + 'total_frags': total_frags, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] + write_flv_header(dest_stream) - write_metadata_tag(dest_stream, metadata) + if not live: + write_metadata_tag(dest_stream, metadata) - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'downloaded_bytes': 0, - 'frag_counter': 0, - } - start = time.time() - - def frag_progress_hook(status): - frag_total_bytes = status.get('total_bytes', 0) - estimated_size = (state['downloaded_bytes'] + - (total_frags - state['frag_counter']) * frag_total_bytes) - if status['status'] == 'finished': - state['downloaded_bytes'] += frag_total_bytes - state['frag_counter'] += 1 - progress = 
self.calc_percent(state['frag_counter'], total_frags) - byte_counter = state['downloaded_bytes'] - else: - frag_downloaded_bytes = status['downloaded_bytes'] - byte_counter = state['downloaded_bytes'] + frag_downloaded_bytes - frag_progress = self.calc_percent(frag_downloaded_bytes, - frag_total_bytes) - progress = self.calc_percent(state['frag_counter'], total_frags) - progress += frag_progress / float(total_frags) - - eta = self.calc_eta(start, time.time(), estimated_size, byte_counter) - self.report_progress(progress, format_bytes(estimated_size), - status.get('speed'), eta) - http_dl.add_progress_hook(frag_progress_hook) + base_url_parsed = compat_urllib_parse_urlparse(base_url) + + self._start_frag_download(ctx) frags_filenames = [] - for (seg_i, frag_i) in fragments_list: + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) - url = base_url + name + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) if akamai_pv: - url += '?' 
+ akamai_pv.strip(';') - frag_filename = '%s-%s' % (tmpfilename, name) - success = http_dl.download(frag_filename, {'url': url}) - if not success: - return False - with open(frag_filename, 'rb') as down: + query.append(akamai_pv.strip(';')) + if info_dict.get('extra_param_to_segment_url'): + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + frag_filename = '%s-%s' % (ctx['tmpfilename'], name) + try: + success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) + if not success: + return False + (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') down_data = down.read() + down.close() reader = FlvReader(down_data) while True: _, box_type, box_data = reader.read_box_info() if box_type == b'mdat': dest_stream.write(box_data) break - frags_filenames.append(frag_filename) + if live: + os.remove(encodeFilename(frag_sanitized)) + else: + frags_filenames.append(frag_sanitized) + except (compat_urllib_error.HTTPError, ) as err: + if live and (err.code == 404 or err.code == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. 
+ msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise + + if not fragments_list and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) + + self._finish_frag_download(ctx) - dest_stream.close() - self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) - - self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: - os.remove(frag_file) - - fsize = os.path.getsize(encodeFilename(filename)) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) + os.remove(encodeFilename(frag_file)) return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py new file mode 100644 index 000000000..5a64b29ee --- /dev/null +++ b/youtube_dl/downloader/fragment.py @@ -0,0 +1,111 @@ +from __future__ import division, unicode_literals + +import os +import time + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + encodeFilename, + sanitize_open, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). 
+ """ + + def _prepare_and_start_frag_download(self, ctx): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx) + + def _prepare_frag_download(self, ctx): + self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit', None), + 'retries': self.params.get('retries', 0), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb') + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + }) + + def _start_frag_download(self, ctx): + total_frags = ctx['total_frags'] + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': 0, + 'frag_index': 0, + 'frag_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + start = time.time() + ctx['started'] = start + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + frag_total_bytes = s.get('total_bytes', 0) + if s['status'] == 'finished': + state['downloaded_bytes'] += frag_total_bytes + state['frag_index'] += 1 + + estimated_size = ( + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) + time_now = time.time() + state['total_bytes_estimate'] = estimated_size + state['elapsed'] = time_now - start + + if s['status'] == 'finished': + progress = self.calc_percent(state['frag_index'], total_frags) + else: + frag_downloaded_bytes = s['downloaded_bytes'] + frag_progress = self.calc_percent(frag_downloaded_bytes, + frag_total_bytes) + progress = self.calc_percent(state['frag_index'], total_frags) + progress += frag_progress / float(total_frags) + + state['eta'] = self.calc_eta( + 
start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) + state['speed'] = s.get('speed') + self._hook_progress(state) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx): + ctx['dest_stream'].close() + elapsed = time.time() - ctx['started'] + self.try_rename(ctx['tmpfilename'], ctx['filename']) + fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index e527ee425..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,15 +4,16 @@ import os import re import subprocess -from ..postprocessor.ffmpeg import FFmpegPostProcessor from .common import FileDownloader -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) +from .fragment import FragmentFD + +from ..compat import compat_urlparse +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( encodeArgument, encodeFilename, + sanitize_open, + handle_youtubedl_headers, ) @@ -23,16 +24,27 @@ class HlsFD(FileDownloader): tmpfilename = self.temp_name(filename) ffpp = FFmpegPostProcessor(downloader=self) - program = ffpp._executable - if program is None: + if not ffpp.available: self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. 
Please install one.') return False ffpp.check_version() - args = [ - encodeArgument(opt) - for opt in (program, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] - args.append(encodeFilename(tmpfilename, True)) + args = [ffpp.executable, '-y'] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + self._debug_cmd(args) retval = subprocess.call(args) if retval == 0: @@ -48,58 +60,55 @@ class HlsFD(FileDownloader): return True else: self.to_stderr('\n') - self.report_error('%s exited with code %d' % (program, retval)) + self.report_error('%s exited with code %d' % (ffpp.basename, retval)) return False -class NativeHlsFD(FileDownloader): +class NativeHlsFD(FragmentFD): """ A more limited implementation that does not require ffmpeg """ + FD_NAME = 'hlsnative' + def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + manifest = self.ydl.urlopen(man_url).read() - self.to_screen( - '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id']) - data = self.ydl.urlopen(url).read() - s = data.decode('utf-8', 'ignore') - segment_urls = [] + s = manifest.decode('utf-8', 'ignore') + fragment_urls = [] for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): segment_url = ( line if re.match(r'^https?://', 
line) - else compat_urlparse.urljoin(url, line)) - segment_urls.append(segment_url) - - is_test = self.params.get('test', False) - remaining_bytes = self._TEST_FILE_SIZE if is_test else None - byte_counter = 0 - with open(tmpfilename, 'wb') as outf: - for i, segurl in enumerate(segment_urls): - self.to_screen( - '[hlsnative] %s: Downloading segment %d / %d' % - (info_dict['id'], i + 1, len(segment_urls))) - seg_req = compat_urllib_request.Request(segurl) - if remaining_bytes is not None: - seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) - - segment = self.ydl.urlopen(seg_req).read() - if remaining_bytes is not None: - segment = segment[:remaining_bytes] - remaining_bytes -= len(segment) - outf.write(segment) - byte_counter += len(segment) - if remaining_bytes is not None and remaining_bytes <= 0: + else compat_urlparse.urljoin(man_url, line)) + fragment_urls.append(segment_url) + # We only download the first fragment during the test + if self.params.get('test', False): break - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, + ctx = { 'filename': filename, - 'status': 'finished', - }) - self.try_rename(tmpfilename, filename) + 'total_frags': len(fragment_urls), + } + + self._prepare_and_start_frag_download(ctx) + + frags_filenames = [] + for i, frag_url in enumerate(fragment_urls): + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + frags_filenames.append(frag_sanitized) + + self._finish_frag_download(ctx) + + for frag_file in frags_filenames: + os.remove(encodeFilename(frag_file)) + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 49170cf9d..56840e026 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -1,21 +1,18 @@ 
from __future__ import unicode_literals +import errno import os +import socket import time - -from socket import error as SocketError -import errno +import re from .common import FileDownloader -from ..compat import ( - compat_urllib_request, - compat_urllib_error, -) +from ..compat import compat_urllib_error from ..utils import ( ContentTooShortError, encodeFilename, sanitize_open, - format_bytes, + sanitized_Request, ) @@ -30,13 +27,8 @@ class HttpFD(FileDownloader): add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - data = info_dict.get('http_post_data') - http_method = info_dict.get('http_method') - basic_request = compat_urllib_request.Request(url, data, headers) - request = compat_urllib_request.Request(url, data, headers) - if http_method is not None: - basic_request.get_method = lambda: http_method - request.get_method = lambda: http_method + basic_request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) @@ -51,7 +43,7 @@ class HttpFD(FileDownloader): open_mode = 'wb' if resume_len != 0: - if self.params.get('continuedl', False): + if self.params.get('continuedl', True): self.report_resuming_byte(resume_len) request.add_header('Range', 'bytes=%d-' % resume_len) open_mode = 'ab' @@ -64,6 +56,24 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. 
This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) + if resume_len > 0: + content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m and resume_len == int(content_range_m.group(1)): + break + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + resume_len = 0 + open_mode = 'wb' break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: @@ -94,6 +104,8 @@ class HttpFD(FileDownloader): self._hook_progress({ 'filename': filename, 'status': 'finished', + 'downloaded_bytes': resume_len, + 'total_bytes': resume_len, }) return True else: @@ -102,7 +114,7 @@ class HttpFD(FileDownloader): resume_len = 0 open_mode = 'wb' break - except SocketError as e: + except socket.error as e: if e.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise @@ -137,7 +149,6 @@ class HttpFD(FileDownloader): self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) return False - data_len_str = format_bytes(data_len) byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) start = time.time() @@ -196,20 +207,19 @@ class HttpFD(FileDownloader): # Progress message speed = self.calc_speed(start, now, byte_counter - resume_len) if data_len is None: - eta = percent = None + eta = None else: - percent = self.calc_percent(byte_counter, data_len) eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - self.report_progress(percent, data_len_str, speed, eta) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': byte_counter, 'total_bytes': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', 'eta': eta, 'speed': speed, + 'elapsed': now - start, }) if is_test and byte_counter == data_len: @@ -221,7 +231,7 @@ class HttpFD(FileDownloader): return False if tmpfilename != '-': stream.close() - self.report_finish(data_len_str, (time.time() - start)) + if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, int(data_len)) self.try_rename(tmpfilename, filename) @@ -235,6 +245,7 @@ class HttpFD(FileDownloader): 'total_bytes': byte_counter, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - start, }) return True diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py deleted file mode 100644 index 72cef30ea..000000000 --- a/youtube_dl/downloader/mplayer.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -import os -import subprocess - -from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) - - -class MplayerFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - args = [ - 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', - 
'-dumpstream', '-dumpfile', tmpfilename, url] - # Check for mplayer first - if not check_executable('mplayer', ['-h']): - self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0]) - return False - - # Download using mplayer. - retval = subprocess.call(args) - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('mplayer exited with code %d' % retval) - return False diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index f7eeb6f43..14d56db47 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import os import re import subprocess -import sys import time from .common import FileDownloader @@ -11,7 +10,7 @@ from ..compat import compat_str from ..utils import ( check_executable, encodeFilename, - format_bytes, + encodeArgument, get_exe_version, ) @@ -51,23 +50,23 @@ class RtmpFD(FileDownloader): if not resume_percent: resume_percent = percent resume_downloaded_data_len = downloaded_data_len - eta = self.calc_eta(start, time.time(), 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time.time(), downloaded_data_len - resume_downloaded_data_len) + time_now = time.time() + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) data_len = None if percent > 0: data_len = int(downloaded_data_len * 100 / percent) - data_len_str = '~' + format_bytes(data_len) - self.report_progress(percent, data_len_str, speed, eta) - cursor_in_new_line = False self._hook_progress({ + 'status': 
'downloading', 'downloaded_bytes': downloaded_data_len, - 'total_bytes': data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', 'eta': eta, + 'elapsed': time_now - start, 'speed': speed, }) + cursor_in_new_line = False else: # no percent for live streams mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) @@ -75,15 +74,15 @@ class RtmpFD(FileDownloader): downloaded_data_len = int(float(mobj.group(1)) * 1024) time_now = time.time() speed = self.calc_speed(start, time_now, downloaded_data_len) - self.report_progress_live_stream(downloaded_data_len, speed, time_now - start) - cursor_in_new_line = False self._hook_progress({ 'downloaded_bytes': downloaded_data_len, 'tmpfilename': tmpfilename, 'filename': filename, 'status': 'downloading', + 'elapsed': time_now - start, 'speed': speed, }) + cursor_in_new_line = False elif self.params.get('verbose', False): if not cursor_in_new_line: self.to_screen('') @@ -106,7 +105,7 @@ class RtmpFD(FileDownloader): protocol = info_dict.get('rtmp_protocol', None) real_time = info_dict.get('rtmp_real_time', False) no_resume = info_dict.get('no_resume', False) - continue_dl = info_dict.get('continuedl', False) + continue_dl = self.params.get('continuedl', True) self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -118,9 +117,11 @@ class RtmpFD(FileDownloader): return False # Download using rtmpdump. rtmpdump returns exit code 2 when - # the connection was interrumpted and resuming appears to be + # the connection was interrupted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. 
- basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] + basic_args = [ + 'rtmpdump', '--verbose', '-r', url, + '-o', tmpfilename] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: @@ -130,7 +131,7 @@ class RtmpFD(FileDownloader): if play_path is not None: basic_args += ['--playpath', play_path] if tc_url is not None: - basic_args += ['--tcUrl', url] + basic_args += ['--tcUrl', tc_url] if test: basic_args += ['--stop', '1'] if flash_version is not None: @@ -153,16 +154,9 @@ class RtmpFD(FileDownloader): if not live and continue_dl: args += ['--skip', '1'] - if sys.platform == 'win32' and sys.version_info < (3, 0): - # Windows subprocess module does not actually support Unicode - # on Python 2.x - # See http://stackoverflow.com/a/9951851/35070 - subprocess_encoding = sys.getfilesystemencoding() - args = [a.encode(subprocess_encoding, 'ignore') for a in args] - else: - subprocess_encoding = None + args = [encodeArgument(a) for a in args] - self._debug_cmd(args, subprocess_encoding, exe='rtmpdump') + self._debug_cmd(args, exe='rtmpdump') RD_SUCCESS = 0 RD_FAILED = 1 @@ -179,7 +173,11 @@ class RtmpFD(FileDownloader): prevsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen('[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED]) + args = basic_args + ['--resume'] + if retval == RD_FAILED: + args += ['--skip', '1'] + args = [encodeArgument(a) for a in args] + retval = run_rtmpdump(args) cursize = os.path.getsize(encodeFilename(tmpfilename)) if prevsize == cursize and retval == RD_FAILED: break diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py new file mode 100644 index 000000000..3eb29526c --- /dev/null +++ b/youtube_dl/downloader/rtsp.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +import os +import subprocess + +from .common import FileDownloader +from 
..utils import ( + check_executable, + encodeFilename, +) + + +class RtspFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + if check_executable('mplayer', ['-h']): + args = [ + 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', + '-dumpstream', '-dumpfile', tmpfilename, url] + elif check_executable('mpv', ['-h']): + args = [ + 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] + else: + self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') + return False + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % (args[0], retval)) + return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a4fab540b..947b83683 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -4,10 +4,14 @@ from .abc import ABCIE from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE -from .adobetv import AdobeTVIE +from .adobetv import ( + AdobeTVIE, + AdobeTVVideoIE, +) from .adultswim import AdultSwimIE from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE +from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .anitube import AnitubeIE @@ -15,9 +19,14 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE +from .appleconnect import 
AppleConnectIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE -from .ard import ARDIE, ARDMediathekIE +from .ard import ( + ARDIE, + ARDMediathekIE, + SportschauIE, +) from .arte import ( ArteTvIE, ArteTVPlus7IE, @@ -31,11 +40,17 @@ from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE +from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE @@ -45,7 +60,10 @@ from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE @@ -58,16 +76,24 @@ from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE from .cbsnews import CBSNewsIE +from .cbssports import CBSSportsIE from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) from .cinchcast import CinchcastIE +from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cloudy import CloudyIE from .clubic import ClubicIE +from .clyp import ClypIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( @@ -83,6 +109,7 @@ from .commonmistakes import CommonMistakesIE, UnicodeBOMIE 
from .condenast import CondeNastIE from .cracked import CrackedIE from .criterion import CriterionIE +from .crooksandliars import CrooksAndLiarsIE from .crunchyroll import ( CrunchyrollIE, CrunchyrollShowPlaylistIE @@ -93,23 +120,34 @@ from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, DailymotionUserIE, + DailymotionCloudIE, ) from .daum import DaumIE from .dbtv import DBTVIE +from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE +from .dhm import DHMIE from .dotsub import DotsubIE +from .douyutv import DouyuTVIE +from .dplay import DPlayIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import DRTVIE from .dvtv import DVTVIE from .dump import DumpIE +from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE -from .divxstage import DivxStageIE from .dropbox import DropboxIE +from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .ehow import EHowIE @@ -121,11 +159,14 @@ from .ellentv import ( EllenTVClipsIE, ) from .elpais import ElPaisIE -from .empflix import EMPFlixIE +from .embedly import EmbedlyIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE +from .espn import ESPNIE +from .esri import EsriVideoIE +from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@ -133,19 +174,19 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE 
-from .fktv import ( - FKTVIE, - FKTVPosteckeIE, -) +from .fivetv import FiveTVIE +from .fktv import FKTVIE from .flickr import FlickrIE from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE from .fourtube import FourTubeIE from .foxgay import FoxgayIE from .foxnews import FoxNewsIE +from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( @@ -164,23 +205,27 @@ from .gameone import ( GameOneIE, GameOnePlaylistIE, ) +from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gametrailers import GametrailersIE +from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE -from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE -from .grooveshark import GroovesharkIE from .groupon import GrouponIE from .hark import HarkIE from .hearthisat import HearThisAtIE @@ -189,9 +234,9 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .historicfilms import HistoricFilmsIE +from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE -from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE @@ -203,11 +248,21 @@ from .imdb import ( ImdbIE, ImdbListIE ) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, +) from .ina import InaIE +from .indavideo import ( + IndavideoIE, + IndavideoEmbedIE, +) from .infoq 
import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .iqiyi import IqiyiIE +from .ir90tv import Ir90TvIE from .ivi import ( IviIE, IviCompilationIE @@ -218,8 +273,11 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE +from .kaltura import KalturaIE +from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE @@ -227,9 +285,32 @@ from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) from .la7 import LA7IE from .laola1tv import Laola1TvIE -from .lifenews import LifeNewsIE +from .lecture2go import Lecture2GoIE +from .letv import ( + LetvIE, + LetvTvIE, + LetvPlaylistIE +) +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -252,6 +333,7 @@ from .metacritic import MetacriticIE from .mgoon import MgoonIE from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE +from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE @@ -272,33 +354,53 @@ from .mtv import ( MTVIE, MTVServicesEmbeddedIE, MTVIggyIE, + MTVDEIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .musicvault import MusicVaultIE from .muzu import MuzuTVIE +from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE 
from .myspass import MySpassIE +from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE +from .nationalgeographic import NationalGeographicIE from .naver import NaverIE from .nba import NBAIE from .nbc import ( NBCIE, NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, + MSNBCIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, ) -from .ndr import NDRIE from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE from .nerdist import NerdistIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE @@ -312,55 +414,101 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE -from .nowness import NownessIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .nowtv import ( + NowTVIE, + NowTVListIE, +) from .nowvideo import NowVideoIE from .npo import ( NPOIE, NPOLiveIE, NPORadioIE, NPORadioFragmentIE, - TegenlichtVproIE, + VPROIE, + WNLIE ) from .nrk import ( NRKIE, + NRKPlaylistIE, NRKTVIE, ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE -from .nytimes import NYTimesIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, +) from .nuvid import NuvidIE +from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE -from .ooyala import OoyalaIE -from .openfilm import OpenFilmIE +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) from .orf 
import ( ORFTVthekIE, ORFOE1IE, ORFFM4IE, + ORFIPTVIE, ) from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .periscope import PeriscopeIE +from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE from .played import PlayedIE from .playfm import PlayFMIE +from .playtvak import PlaytvakIE from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE -from .pornhub import PornHubIE +from .pornhub import ( + PornHubIE, + PornHubPlaylistIE, +) from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE +from .puls4 import Puls4IE from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) from .quickvid import QuickVidIE +from .r7 import R7IE from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE +from .rds import RDSIE from .redtube import RedTubeIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE @@ -370,12 +518,12 @@ from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE -from .rtlnl import RtlXlIE -from .rtlnow import RTLnowIE +from .rtlnl import RtlNlIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE +from .rtve 
import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtvnh import RTVNHIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, @@ -385,16 +533,24 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .ruutu import RuutuIE +from .sandia import SandiaIE +from .safari import ( + SafariIE, + SafariCourseIE, +) from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE -from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE +from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE from .sexykarma import SexyKarmaIE +from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE @@ -406,36 +562,60 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, - SoundcloudPlaylistIE + SoundcloudPlaylistIE, + SoundcloudSearchIE +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE ) -from .soundgasm import SoundgasmIE from .southpark import ( SouthParkIE, - SouthparkDeIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkNlIE ) from .space import SpaceIE +from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import SportBoxIE +from .sportbox import ( + SportBoxIE, + SportBoxEmbedIE, +) from .sportdeutschland import SportDeutschlandIE +from 
.srf import SrfIE from .srmediathek import SRMediathekIE +from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE from .sunporno import SunPornoIE -from .svtplay import SVTPlayIE +from .svt import ( + SVTIE, + SVTPlayIE, +) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE @@ -452,6 +632,7 @@ from .techtalks import TechTalksIE from .ted import TEDIE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE from .telemb import TeleMBIE from .teletask import TeleTaskIE from .tenplay import TenPlayIE @@ -459,13 +640,24 @@ from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE from .theonion import TheOnionIE -from .theplatform import ThePlatformIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) from .thesixtyone import TheSixtyOneIE +from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE -from .tmz import TMZIE -from .tnaflix import TNAFlixIE +from .tmz import ( + TMZIE, + TMZArticleIE, +) +from .tnaflix import ( + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) from .thvideo import ( THVideoIE, THVideoPlaylistIE @@ -476,16 +668,30 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, +) +from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .tweakers import TweakersIE from 
.twentyfourvideo import TwentyFourVideoIE +from .twentytwotracks import ( + TwentyTwoTracksIE, + TwentyTwoTracksGenreIE +) from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -495,26 +701,33 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, UdemyCourseIE ) +from .udn import UDNEmbedIE +from .ultimedia import UltimediaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE +from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE +from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE -from .vgtv import VGTVIE +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE -from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -523,6 +736,8 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE +from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -538,12 +753,17 @@ from .vine import ( VineIE, VineUserIE, ) -from .viki import VikiIE +from .viki import ( + VikiIE, + VikiChannelIE, +) from .vk import ( VKIE, VKUserVideosIE, ) +from .vlive import VLiveIE from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE @@ -558,7 +778,10 @@ from .wdr import ( WDRMobileIE, WDRMausIE, ) -from .webofstories import WebOfStoriesIE +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) from .weibo import WeiboIE from .wimp 
import WimpIE from .wistia import WistiaIE @@ -567,18 +790,30 @@ from .wrzuta import WrzutaIE from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE -from .xhamster import XHamsterIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, +) from .xminus import XMinusIE from .xnxx import XNXXIE -from .xvideos import XVideosIE +from .xstream import XstreamIE from .xtube import XTubeUserIE, XTubeIE from .xuite import XuiteIE +from .xvideos import XVideosIE from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, YahooSearchIE, ) +from .yam import YamIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, +) from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE @@ -599,8 +834,10 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, + YoutubeUserPlaylistsIE, YoutubeWatchLaterIE, ) +from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ( ZingMp3SongIE, diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index dc0fb85d6..c0e5d1abf 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,16 +1,20 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor +from ..utils import ( + ExtractorError, + js_to_json, + int_or_none, +) class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', 'md5': 'cb3dd03b18455a661071ee1e28344d9f', 'info_dict': { @@ -19,23 +23,62 @@ class ABCIE(InfoExtractor): 'title': 'Australia to help staff Ebola treatment 
centre in Sierra Leone', 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', }, - } + }, { + 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', + 'md5': 'db2a5369238b51f9811ad815b69dc086', + 'info_dict': { + 'id': 'NvqvPeNZsHU', + 'ext': 'mp4', + 'upload_date': '20150816', + 'uploader': 'ABC News (Australia)', + 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef', + 'uploader_id': 'NewsOnABC', + 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', + 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', + 'info_dict': { + 'id': '6880080', + 'ext': 'mp3', + 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + }, + }, { + 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - urls_info_json = self._search_regex( - r'inlineVideoData\.push\((.*?)\);', webpage, 'video urls', - flags=re.DOTALL) - urls_info = json.loads(urls_info_json.replace('\'', '"')) + mobj = re.search( + r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', + webpage) + if mobj is None: + raise ExtractorError('Unable to extract video urls') + + urls_info = self._parse_json( + mobj.group('json_data'), video_id, transform_source=js_to_json) + + if not isinstance(urls_info, list): + urls_info = [urls_info] + + if mobj.group('type') == 'YouTube': + return self.playlist_result([ + self.url_result(url_info['url']) for url_info in urls_info]) + formats = [{ 'url': 
url_info['url'], - 'width': int(url_info['width']), - 'height': int(url_info['height']), - 'tbr': int(url_info['bitrate']), - 'filesize': int(url_info['filesize']), + 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none', + 'width': int_or_none(url_info.get('width')), + 'height': int_or_none(url_info.get('height')), + 'tbr': int_or_none(url_info.get('bitrate')), + 'filesize': int_or_none(url_info.get('filesize')), } for url_info in urls_info] + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 47313fba8..34095501c 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -15,7 +15,7 @@ class AcademicEarthCourseIE(InfoExtractor): 'title': 'Laws of Nature', 'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.', }, - 'playlist_count': 4, + 'playlist_count': 3, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 203936e54..e3e6d2113 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -11,12 +11,13 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + qualities, ) class AddAnimeIE(InfoExtractor): - _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<id>[\w_]+)(?:.*)' - _TEST = { + _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)' + _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', 'info_dict': { @@ -25,7 +26,10 @@ class AddAnimeIE(InfoExtractor): 'description': 'One Piece 606', 'title': 'One Piece 606', } - } + }, { + 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -63,8 +67,10 @@ 
class AddAnimeIE(InfoExtractor): note='Confirming after redirect') webpage = self._download_webpage(url, video_id) + FORMATS = ('normal', 'hq') + quality = qualities(FORMATS) formats = [] - for format_id in ('normal', 'hq'): + for format_id in FORMATS: rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) video_url = self._search_regex(rex, webpage, 'video file URLx', fatal=False) @@ -73,6 +79,7 @@ class AddAnimeIE(InfoExtractor): formats.append({ 'format_id': format_id, 'url': video_url, + 'quality': quality(format_id), }) self._sort_formats(formats) video_title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 28e07f8b0..5e43adc51 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -5,6 +5,8 @@ from ..utils import ( parse_duration, unified_strdate, str_to_int, + float_or_none, + ISO639Utils, ) @@ -28,7 +30,6 @@ class AdobeTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) player = self._parse_json( @@ -44,8 +45,10 @@ class AdobeTVIE(InfoExtractor): self._html_search_meta('datepublished', webpage, 'upload date')) duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') - or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration')) + self._html_search_meta('duration', webpage, 'duration') or + self._search_regex( + r'Runtime:\s*(\d{2}:\d{2}:\d{2})', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>', @@ -68,3 +71,61 @@ class AdobeTVIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class AdobeTVVideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' + + _TEST = { + # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners + 'url': 
'https://video.tv.adobe.com/v/2456/', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), + video_id) + + formats = [{ + 'url': source['src'], + 'width': source.get('width'), + 'height': source.get('height'), + 'tbr': source.get('bitrate'), + } for source in player_params['sources']] + + # For both metadata and downloaded files the duration varies among + # formats. I just pick the max one + duration = max(filter(None, [ + float_or_none(source.get('duration'), scale=1000) + for source in player_params['sources']])) + + subtitles = {} + for translation in player_params.get('translations', []): + lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + if lang_id not in subtitles: + subtitles[lang_id] = [] + subtitles[lang_id].append({ + 'url': translation['vttPath'], + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': player_params['title'], + 'description': self._og_search_description(webpage), + 'duration': duration, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 502a9c25a..3ae618e71 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, - xpath_text, float_or_none, + xpath_text, ) @@ -38,9 +38,11 @@ class AdultSwimIE(InfoExtractor): }, ], 'info_dict': { + 'id': 'rQxZvXQ4ROaSOqq-or2Mow', 'title': 'Rick and Morty - 
Pilot', 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - } + }, + 'skip': 'This video is only available for registered users', }, { 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', 'playlist': [ @@ -55,9 +57,28 @@ class AdultSwimIE(InfoExtractor): } ], 'info_dict': { + 'id': '-t8CamQlQ2aYZ49ItZCFog', 'title': 'American Dad - Putting Francine Out of Business', 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' }, + }, { + 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', + 'playlist': [ + { + 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', + 'ext': 'flv', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + }, + } + ], + 'info_dict': { + 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + }, }] @staticmethod @@ -78,6 +99,7 @@ class AdultSwimIE(InfoExtractor): for video in collection.get('videos'): if video.get('slug') == slug: return collection, video + return None, None def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -88,30 +110,39 @@ class AdultSwimIE(InfoExtractor): webpage = self._download_webpage(url, episode_path) # Extract the value of `bootstrappedData` from the Javascript in the page. 
- bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) - - try: - bootstrappedData = json.loads(bootstrappedDataJS) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % episode_path - raise ExtractorError(errmsg, cause=ve) + bootstrapped_data = self._parse_json(self._search_regex( + r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) # Downloading videos from a /videos/playlist/ URL needs to be handled differently. # NOTE: We are only downloading one video (the current one) not the playlist if is_playlist: - collections = bootstrappedData['playlists']['collections'] + collections = bootstrapped_data['playlists']['collections'] collection = self.find_collection_by_linkURL(collections, show_path) video_info = self.find_video_info(collection, episode_path) show_title = video_info['showTitle'] segment_ids = [video_info['videoPlaybackID']] else: - collections = bootstrappedData['show']['collections'] + collections = bootstrapped_data['show']['collections'] collection, video_info = self.find_collection_containing_video(collections, episode_path) - - show = bootstrappedData['show'] + # Video wasn't found in the collections, let's try `slugged_video`. + if video_info is None: + if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: + video_info = bootstrapped_data['slugged_video'] + else: + raise ExtractorError('Unable to find video info') + + show = bootstrapped_data['show'] show_title = show['title'] - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + stream = video_info.get('stream') + clips = [stream] if stream else video_info.get('clips') + if not clips: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.' 
+ if video_info.get('auth') is True else 'Unable to find stream or clips', + expected=True) + segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] episode_title = video_info['title'] @@ -120,7 +151,7 @@ class AdultSwimIE(InfoExtractor): entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id + segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: @@ -134,19 +165,32 @@ class AdultSwimIE(InfoExtractor): xpath_text(idoc, './/trt', 'segment duration').strip()) formats = [] - file_els = idoc.findall('.//files/file') + file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') + unique_urls = [] + unique_file_els = [] for file_el in file_els: + media_url = file_el.text + if not media_url or determine_ext(media_url) == 'f4m': + continue + if file_el.text not in unique_urls: + unique_urls.append(file_el.text) + unique_file_els.append(file_el) + + for file_el in unique_file_els: bitrate = file_el.attrib.get('bitrate') ftype = file_el.attrib.get('type') - - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() else None, - 'quality': 1 if ftype == 'hd' else -1 - }) + media_url = file_el.text + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, segment_title, 'mp4', preference=0, m3u8_id='hls')) + else: + formats.append({ + 'format_id': '%s_%s' % (bitrate, ftype), + 'url': file_el.text.strip(), + # The bitrate may not be a number (for example: 'iphone') + 'tbr': int(bitrate) if bitrate.isdigit() else None, + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/aftenposten.py 
b/youtube_dl/extractor/aftenposten.py index 2b257ede7..0c00acfb5 100644 --- a/youtube_dl/extractor/aftenposten.py +++ b/youtube_dl/extractor/aftenposten.py @@ -1,23 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - xpath_with_ns, - xpath_text, - find_xpath_attr, -) class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/([^/]+/)*(?P<id>[^/]+)-\d+\.html' - + _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' _TEST = { - 'url': 'http://www.aftenposten.no/webtv/serier-og-programmer/sweatshopenglish/TRAILER-SWEATSHOP---I-cant-take-any-more-7800835.html?paging=&section=webtv_serierogprogrammer_sweatshop_sweatshopenglish', + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', 'md5': 'fd828cd29774a729bf4d4425fe192972', 'info_dict': { 'id': '21039', @@ -30,74 +20,4 @@ class AftenpostenIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._html_search_regex( - r'data-xs-id="(\d+)"', webpage, 'video id') - - data = self._download_xml( - 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id) - - NS_MAP = { - 'atom': 'http://www.w3.org/2005/Atom', - 'xt': 'http://xstream.dk/', - 'media': 'http://search.yahoo.com/mrss/', - } - - entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) - - title = xpath_text( - entry, xpath_with_ns('./atom:title', NS_MAP), 'title') - description = xpath_text( - entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') - timestamp = parse_iso8601(xpath_text( - entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) - - formats = [] - media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) - for media_content in media_group.findall(xpath_with_ns('./media:content',
NS_MAP)): - media_url = media_content.get('url') - if not media_url: - continue - tbr = int_or_none(media_content.get('bitrate')) - mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url) - if mobj: - formats.append({ - 'url': mobj.group('url'), - 'play_path': 'mp4:%s' % mobj.group('playpath'), - 'app': mobj.group('app'), - 'ext': 'flv', - 'tbr': tbr, - 'format_id': 'rtmp-%d' % tbr, - }) - else: - formats.append({ - 'url': media_url, - 'tbr': tbr, - }) - self._sort_formats(formats) - - link = find_xpath_attr( - entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') - if link is not None: - formats.append({ - 'url': link.get('href'), - 'format_id': link.get('rel'), - }) - - thumbnails = [{ - 'url': splash.get('url'), - 'width': int_or_none(splash.get('width')), - 'height': int_or_none(splash.get('height')), - } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index 8442019ea..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -2,14 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?P<video_id>article[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 
'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -24,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') @@ -43,9 +45,9 @@ class AftonbladetIE(InfoExtractor): formats.append({ 'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']), 'ext': 'mp4', - 'width': fmt['width'], - 'height': fmt['height'], - 'tbr': fmt['bitrate'], + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'tbr': int_or_none(fmt.get('bitrate')), 'protocol': 'http', }) self._sort_formats(formats) @@ -54,9 +56,9 @@ class AftonbladetIE(InfoExtractor): 'id': video_id, 'title': internal_meta_json['title'], 'formats': formats, - 'thumbnail': internal_meta_json['imageUrl'], - 'description': internal_meta_json['shortPreamble'], - 'timestamp': internal_meta_json['timePublished'], - 'duration': internal_meta_json['duration'], - 'view_count': internal_meta_json['views'], + 'thumbnail': internal_meta_json.get('imageUrl'), + 'description': internal_meta_json.get('shortPreamble'), + 'timestamp': int_or_none(internal_meta_json.get('timePublished')), + 'duration': int_or_none(internal_meta_json.get('duration')), + 'view_count': int_or_none(internal_meta_json.get('views')), } diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py new file mode 100644 index 000000000..f8e70f4e5 --- /dev/null +++ b/youtube_dl/extractor/airmozilla.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import
unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class AirMozillaIE(InfoExtractor): + _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?' + _TEST = { + 'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/', + 'md5': '2e3e7486ba5d180e829d453875b9b8bf', + 'info_dict': { + 'id': '6x4q2w', + 'ext': 'mp4', + 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', + 'thumbnail': 're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster', + 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', + 'timestamp': 1422487800, + 'upload_date': '20150128', + 'location': 'SFO Commons', + 'duration': 3780, + 'view_count': int, + 'categories': ['Main', 'Privacy'], + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id') + + embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id) + jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata') + metadata = self._parse_json(jwconfig, video_id) + + formats = [{ + 'url': source['file'], + 'ext': source['type'], + 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), + 'format': source['label'], + 'height': int(source['label'].rstrip('p')), + } for source in metadata['playlist'][0]['sources']] + self._sort_formats(formats) + + view_count = int_or_none(self._html_search_regex( + r'Views since archived: ([0-9]+)', + webpage, 'view count', fatal=False)) + timestamp = parse_iso8601(self._html_search_regex( + r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False)) + duration = 
parse_duration(self._search_regex( + r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)', + webpage, 'duration', fatal=False)) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'url': self._og_search_url(webpage), + 'display_id': display_id, + 'thumbnail': metadata['playlist'][0].get('image'), + 'description': self._og_search_description(webpage), + 'timestamp': timestamp, + 'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None), + 'duration': duration, + 'view_count': view_count, + 'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage), + } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 612708e25..5b2c0dc9a 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -15,7 +15,8 @@ class AlJazeeraIE(InfoExtractor): 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', 'uploader': 'Al Jazeera English', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): @@ -31,5 +32,5 @@ class AlJazeeraIE(InfoExtractor): 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' '&%40videoPlayer={0}'.format(brightcove_id) ), - 'ie_key': 'Brightcove', + 'ie_key': 'BrightcoveLegacy', } diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 31f0d417c..23f942ae2 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -26,8 +26,8 @@ class AnitubeIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = self._html_search_regex( - r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key') + key = self._search_regex( + r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') config_xml = self._download_xml( 
'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py new file mode 100644 index 000000000..ea7a70393 --- /dev/null +++ b/youtube_dl/extractor/appleconnect.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + ExtractorError +) + + +class AppleConnectIE(InfoExtractor): + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' + _TEST = { + 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'info_dict': { + 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'ext': 'm4v', + 'title': 'Energy', + 'uploader': 'Drake', + 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'upload_date': '20150710', + 'timestamp': 1436545535, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + try: + video_json = self._html_search_regex( + r'class="auc-video-data">(\{.*?\})', webpage, 'json') + except ExtractorError: + raise ExtractorError('This post doesn\'t contain a video', expected=True) + + video_data = self._parse_json(video_json, video_id) + timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + + return { + 'id': video_id, + 'url': video_data['sslSrc'], + 'title': video_data['title'], + 'description': video_data['description'], + 'uploader': video_data['artistName'], + 'thumbnail': video_data['artworkUrl'], + 'timestamp': timestamp, + 'like_count': like_count, + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 287f71e07..f68dc3236 100644 --- 
a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,56 +11,62 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' - _TEST = { - "url": "http://trailers.apple.com/trailers/wb/manofsteel/", - "playlist": [ + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _TESTS = [{ + 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', + 'info_dict': { + 'id': 'manofsteel', + }, + 'playlist': [ { - "md5": "d97a8e575432dbcb81b7c3acb741f8a8", - "info_dict": { - "id": "manofsteel-trailer4", - "ext": "mov", - "duration": 111, - "title": "Trailer 4", - "upload_date": "20130523", - "uploader_id": "wb", + 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8', + 'info_dict': { + 'id': 'manofsteel-trailer4', + 'ext': 'mov', + 'duration': 111, + 'title': 'Trailer 4', + 'upload_date': '20130523', + 'uploader_id': 'wb', }, }, { - "md5": "b8017b7131b721fb4e8d6f49e1df908c", - "info_dict": { - "id": "manofsteel-trailer3", - "ext": "mov", - "duration": 182, - "title": "Trailer 3", - "upload_date": "20130417", - "uploader_id": "wb", + 'md5': 'b8017b7131b721fb4e8d6f49e1df908c', + 'info_dict': { + 'id': 'manofsteel-trailer3', + 'ext': 'mov', + 'duration': 182, + 'title': 'Trailer 3', + 'upload_date': '20130417', + 'uploader_id': 'wb', }, }, { - "md5": "d0f1e1150989b9924679b441f3404d48", - "info_dict": { - "id": "manofsteel-trailer", - "ext": "mov", - "duration": 148, - "title": "Trailer", - "upload_date": "20121212", - "uploader_id": "wb", + 'md5': 'd0f1e1150989b9924679b441f3404d48', + 'info_dict': { + 'id': 'manofsteel-trailer', + 'ext': 'mov', + 'duration': 148, + 'title': 'Trailer', + 'upload_date': '20121212', + 'uploader_id': 'wb', }, }, { - "md5": "5fe08795b943eb2e757fa95cb6def1cb", - "info_dict": { - "id": "manofsteel-teaser", - "ext": "mov", - "duration": 93, - "title": "Teaser", - 
"upload_date": "20120721", - "uploader_id": "wb", + 'md5': '5fe08795b943eb2e757fa95cb6def1cb', + 'info_dict': { + 'id': 'manofsteel-teaser', + 'ext': 'mov', + 'duration': 93, + 'title': 'Teaser', + 'upload_date': '20120721', + 'uploader_id': 'wb', }, }, ] - } + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }] _JSON_RE = r'iTunes.playURL\((.*?)\);' diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 9fc35a42b..8feb7cb74 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -33,7 +33,7 @@ class ArchiveOrgIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_url = url + ('?' if '?' in url else '&') + 'output=json' + json_url = url + ('&' if '?' in url else '?') + 'output=json' data = self._download_json(json_url, video_id) def get_optional(data_dict, field): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 783b53e23..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,13 +8,14 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, + get_element_by_attribute, qualities, int_or_none, parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, + 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', + 'info_dict': { + 'id': '29582122', + 'ext': 'mp4', + 'title': 'Ich liebe das Leben trotzdem', + 'description': 'md5:45e4c225c72b27993314b31a84a5261c', + 'duration': 4557, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', + 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', 'info_dict': { - 'id': '22490580', + 'id': '29522730', 'ext': 'mp4', - 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', - 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. 
ab 20 Uhr)', + 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', + 'duration': 5252, + }, + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', + 'info_dict': { + 'id': '28488308', + 'ext': 'mp3', + 'title': 'Tod eines Fußballers', + 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', + 'duration': 3240, }, - 'skip': 'Blocked outside of Germany', + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, }] + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + + formats = self._extract_formats(media_info, video_id) + + if not formats: + if '"fsk"' in webpage: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + raise ExtractorError('This video is not available due to geo restriction', expected=True) + + self._sort_formats(formats) + + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') 
+ server = stream.get('_server') + for stream_url in stream_urls: + ext = determine_ext(stream_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', + video_id, preference=-1, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + elif stream_url.startswith('http'): + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + else: + continue + m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -50,8 +157,11 @@ class ARDMediathekIE(InfoExtractor): if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage: raise ExtractorError('Video %s is no longer available' % video_id, expected=True) + if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage: + raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' 
% video_id, expected=True) + if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) @@ -89,46 +199,22 @@ class ARDMediathekIE(InfoExtractor): 'format_id': fid, 'url': furl, }) + self._sort_formats(formats) + info = { + 'formats': formats, + } else: # request JSON file - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') + info = self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) - formats = [] - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) - }) - continue - - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } - - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) - - formats.append(format) - - self._sort_formats(formats) - - return { + info.update({ 'id': video_id, 'title': title, 'description': description, - 'formats': formats, 'thumbnail': thumbnail, - } + }) + + return info class ARDIE(InfoExtractor): @@ -186,3 +272,41 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class SportschauIE(ARDMediathekIE): + IE_NAME = 'Sportschau' + _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html' + _TESTS = [{ + 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', + 'info_dict': { + 'id': 
'seppeltkokainhatnichtsmitklassischemdopingzutun100', + 'ext': 'mp4', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + base_url = mobj.group('baseurl') + + webpage = self._download_webpage(url, video_id) + title = get_element_by_attribute('class', 'headline', webpage) + description = self._html_search_meta('description', webpage, 'description') + + info = self._extract_media_info( + base_url + '-mc_defaultQuality-h.json', webpage, video_id) + + info.update({ + 'title': title, + 'description': description, + }) + + return info diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 929dd3cc5..2a00da3ee 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,10 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( find_xpath_attr, unified_strdate, - get_element_by_id, get_element_by_attribute, int_or_none, qualities, @@ -78,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex( [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url') + webpage, 'json vp url', default=None) + if not json_url: + iframe_url = self._html_search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url') + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] return self._extract_from_json_url(json_url, 
video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): @@ -146,6 +155,7 @@ class ArteTVPlus7IE(InfoExtractor): formats.append(format) + self._check_formats(formats, video_id) self._sort_formats(formats) info_dict['formats'] = formats @@ -194,7 +204,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index f016368fa..50e47ba0a 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -3,22 +3,23 @@ from __future__ import unicode_literals import time import hmac -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, - compat_urllib_request, ) from ..utils import ( int_or_none, float_or_none, + sanitized_Request, xpath_text, ExtractorError, ) -class AtresPlayerIE(SubtitlesInfoExtractor): +class AtresPlayerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' + _NETRC_MACHINE = 'atresplayer' _TESTS = [ { 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', @@ -62,7 +63,7 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 'j_password': password, } - request = compat_urllib_request.Request( + request = sanitized_Request( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( @@ -93,7 +94,7 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 
formats = [] for fmt in ['windows', 'android_tablet']: - request = compat_urllib_request.Request( + request = sanitized_Request( self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) request.add_header('User-Agent', self._USER_AGENT) @@ -144,13 +145,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor): thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') subtitles = {} - subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle: - subtitles['es'] = subtitle - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') + if subtitle_url: + subtitles['es'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] return { 'id': video_id, @@ -159,5 +159,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py new file mode 100644 index 000000000..e37ee4440 --- /dev/null +++ b/youtube_dl/extractor/baidu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse + + +class BaiduVideoIE(InfoExtractor): + IE_DESC = '百度视频' + _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _TESTS = [{ + 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', + 'info_dict': { + 'id': '1069', + 'title': '中华小当家 TV版 (全52集)', + 'description': 'md5:395a419e41215e531c857bb037bbaf80', + }, + 'playlist_count': 52, + }, { + 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', + 'info_dict': { + 'id': '11595', + 'title': 're:^奔跑吧兄弟', + 'description': 'md5:1bf88bad6d850930f542d51547c089b8', + }, + 
'playlist_mincount': 3, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + category = category2 = mobj.group('type') + if category == 'show': + category2 = 'tvshow' + + webpage = self._download_webpage(url, playlist_id) + + playlist_title = self._html_search_regex( + r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage, + 'playlist title', group='title') + playlist_description = self._html_search_regex( + r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage, + playlist_id, 'playlist description') + + site = self._html_search_regex( + r'filterSite\s*:\s*["\']([^"]*)["\']', webpage, + 'primary provider site') + api_result = self._download_json( + 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % ( + category, category2, playlist_id, site), + playlist_id, 'Get playlist links') + + entries = [] + for episode in api_result[0]['episodes']: + episode_id = '%s_%s' % (playlist_id, episode['episode']) + + redirect_page = self._download_webpage( + compat_urlparse.urljoin(url, episode['url']), episode_id, + note='Download Baidu redirect page') + real_url = self._html_search_regex( + r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL') + + entries.append(self.url_result( + real_url, video_title=episode['single_title'])) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index c193e66ca..da986e063 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -1,12 +1,18 @@ from __future__ import unicode_literals import re -import json import itertools from .common import InfoExtractor from ..compat import ( - compat_urllib_request, + compat_urllib_parse, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, + sanitized_Request, ) @@ -14,6 +20,8 @@ class BambuserIE(InfoExtractor): IE_NAME = 'bambuser' 
_VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' _API_KEY = '005f64509e19a868399060af746a00aa' + _LOGIN_URL = 'https://bambuser.com/user' + _NETRC_MACHINE = 'bambuser' _TEST = { 'url': 'http://bambuser.com/v/4050584', @@ -26,6 +34,9 @@ class BambuserIE(InfoExtractor): 'duration': 3741, 'uploader': 'pixelversity', 'uploader_id': '344706', + 'timestamp': 1382976692, + 'upload_date': '20131028', + 'view_count': int, }, 'params': { # It doesn't respect the 'Range' header, it would download the whole video @@ -34,23 +45,60 @@ class BambuserIE(InfoExtractor): }, } + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'form_id': 'user_login', + 'op': 'Log in', + 'name': username, + 'pass': password, + } + + request = sanitized_Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + request.add_header('Referer', self._LOGIN_URL) + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">(.+?)</div>', + response, 'login error', default=None) + if login_error: + raise ExtractorError( + 'Unable to login: %s' % login_error, expected=True) + + def _real_initialize(self): + self._login() + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - info_url = ('http://player-c.api.bambuser.com/getVideo.json?' 
- '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) - info_json = self._download_webpage(info_url, video_id) - info = json.loads(info_json)['result'] + video_id = self._match_id(url) + + info = self._download_json( + 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s' + % (self._API_KEY, video_id), video_id) + + error = info.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) + + result = info['result'] return { 'id': video_id, - 'title': info['title'], - 'url': info['url'], - 'thumbnail': info.get('preview'), - 'duration': int(info['length']), - 'view_count': int(info['views_total']), - 'uploader': info['username'], - 'uploader_id': info['owner']['uid'], + 'title': result['title'], + 'url': result['url'], + 'thumbnail': result.get('preview'), + 'duration': int_or_none(result.get('length')), + 'uploader': result.get('username'), + 'uploader_id': compat_str(result.get('owner', {}).get('uid')), + 'timestamp': int_or_none(result.get('created')), + 'fps': float_or_none(result.get('framerate')), + 'view_count': int_or_none(result.get('views_total')), + 'comment_count': int_or_none(result.get('comment_count')), } @@ -78,7 +126,7 @@ class BambuserChannelIE(InfoExtractor): '&sort=created&access_mode=0%2C1%2C2&limit={count}' '&method=broadcast&format=json&vid_older_than={last}' ).format(user=user, count=self._STEP, last=last_id) - req = compat_urllib_request.Request(req_url) + req = sanitized_Request(req_url) # Without setting this header, we wouldn't get any result req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) data = self._download_json( diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 490cc961a..c1ef8051d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,6 +10,8 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, ) @@ -52,11 +54,11 @@ class 
BandcampIE(InfoExtractor): ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': self._proto_relative_url(format_url, 'http:'), 'ext': ext, 'vcodec': 'none', 'acodec': ext, - 'abr': int(abr_str), + 'abr': int_or_none(abr_str), }) self._sort_formats(formats) @@ -65,14 +67,14 @@ class BandcampIE(InfoExtractor): 'id': compat_str(data['id']), 'title': data['title'], 'formats': formats, - 'duration': float(data['duration']), + 'duration': float_or_none(data.get('duration')), } else: raise ExtractorError('No free songs found') download_link = m_download.group(1) video_id = self._search_regex( - r'(?ms)var TralbumData = {.*?id: (?P<id>\d+),?$', + r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', webpage, 'video id') download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') @@ -93,8 +95,8 @@ class BandcampIE(InfoExtractor): final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url') # If we could correctly generate the .rand field the url would be # in the "download_url" key - final_url = self._search_regex( - r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL') + final_url = self._proto_relative_url(self._search_regex( + r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:') return { 'id': video_id, @@ -109,7 +111,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+)|/?(?:$|[?#]))' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -133,31 +135,37 @@ class BandcampAlbumIE(InfoExtractor): ], 'info_dict': { 'title': 'Jazz Format Mixtape vol.1', + 'id': 'jazz-format-mixtape-vol-1', + 'uploader_id': 'blazo', }, 'params': { 
'playlistend': 2 }, - 'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' + 'skip': 'Bandcamp imposes download limits.' }, { 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', 'info_dict': { 'title': 'Hierophany of the Open Grave', + 'uploader_id': 'nightbringer', + 'id': 'hierophany-of-the-open-grave', }, 'playlist_mincount': 9, }, { 'url': 'http://dotscale.bandcamp.com', 'info_dict': { 'title': 'Loom', + 'id': 'dotscale', + 'uploader_id': 'dotscale', }, 'playlist_mincount': 7, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('subdomain') - title = mobj.group('title') - display_id = title or playlist_id - webpage = self._download_webpage(url, display_id) + uploader_id = mobj.group('subdomain') + album_id = mobj.group('album_id') + playlist_id = album_id or uploader_id + webpage = self._download_webpage(url, playlist_id) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: raise ExtractorError('The page doesn\'t contain any tracks') @@ -168,8 +176,8 @@ class BandcampAlbumIE(InfoExtractor): r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False) return { '_type': 'playlist', + 'uploader_id': uploader_id, 'id': playlist_id, - 'display_id': display_id, 'title': title, 'entries': entries, } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py new file mode 100644 index 000000000..33b296eaf --- /dev/null +++ b/youtube_dl/extractor/bbc.py @@ -0,0 +1,937 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, + remove_end, + unescapeHTML, +) +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) + + +class BBCCoUkIE(InfoExtractor): + IE_NAME = 'bbc.co.uk' + IE_DESC = 'BBC iPlayer' + _VALID_URL = 
r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z]{8})' + + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams with even better quality that pc mediaset but fails + # with geolocation in some cases when it's even not geo restricted at all (e.g. + # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] + + _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' + _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' + + _NAMESPACES = ( + _MEDIASELECTION_NS, + _EMP_PLAYLIST_NS, + ) + + _TESTS = [ + { + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', + 'info_dict': { + 'id': 'b039d07m', + 'ext': 'flv', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', + 'duration': 1740, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Man in Black: Series 3: The Printed Name', + 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. 
Stars Ewan Bailey.", + 'duration': 1800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Voice UK: Series 3: Blind Auditions 5', + 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'duration': 5100, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', + 'info_dict': { + 'id': 'b03k3pb7', + 'ext': 'flv', + 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", + 'description': '2. 
Invasion', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'note': 'Audio', + 'info_dict': { + 'id': 'p02frcch', + 'ext': 'flv', + 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', + 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', + 'duration': 3507, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. 
Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + # iptv-all mediaset fails with geolocation however there is no geo restriction + # for this programme at all + 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf', + 'info_dict': { + 'id': 'b06bp7kf', + 'ext': 'flv', + 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie", + 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.', + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', + 'only_matching': True, + } + ] + + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + + def _extract_asx_playlist(self, connection, programme_id): + asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') + return [ref.get('href') for ref in asx.findall('./Entry/ref')] + + def _extract_connection(self, connection, programme_id): + formats = [] + kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + if protocol == 'http': + href = 
connection.get('href') + transfer_format = connection.get('transferFormat') + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, supplier), + }) + # Skip DASH until supported + elif transfer_format == 'dash': + pass + elif transfer_format == 'hls': + m3u8_formats = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=supplier, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + # Direct link + else: + formats.append({ + 'url': href, + 'format_id': supplier or kind or protocol, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + formats.append({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + 'format_id': supplier, + }) + return formats + + def _extract_items(self, playlist): + return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + + def _findall_ns(self, element, xpath): + elements = [] + for ns in self._NAMESPACES: + elements.extend(element.findall(xpath % ns)) + return elements + + def _extract_medias(self, media_selection): + error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) + if error is None: + media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) + if error is not None: + raise BBCCoUkIE.MediaSelectionError(error.get('id')) + return self._findall_ns(media_selection, './{%s}media') + + def _extract_connections(self, media): + return self._findall_ns(media, './{%s}connection') + + def 
_extract_video(self, media, programme_id): + formats = [] + vbr = int_or_none(media.get('bitrate')) + vcodec = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'width': width, + 'height': height, + 'vbr': vbr, + 'vcodec': vcodec, + 'filesize': file_size, + }) + if service: + format['format_id'] = '%s_%s' % (service, format['format_id']) + formats.extend(conn_formats) + return formats + + def _extract_audio(self, media, programme_id): + formats = [] + abr = int_or_none(media.get('bitrate')) + acodec = media.get('encoding') + service = media.get('service') + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'format_id': '%s_%s' % (service, format['format_id']), + 'abr': abr, + 'acodec': acodec, + }) + formats.extend(conn_formats) + return formats + + def _get_subtitles(self, media, programme_id): + subtitles = {} + for connection in self._extract_connections(media): + captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + ] + return subtitles + + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + + def _download_media_selector(self, programme_id): + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + 
except BBCCoUkIE.MediaSelectionError as e: + if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) + + def _download_media_selector_url(self, url, programme_id=None): + try: + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404): + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) + else: + raise + return self._process_media_selector(media_selection, programme_id) + + def _process_media_selector(self, media_selection, programme_id): + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + return formats, subtitles + + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind != 'programme' and kind != 'radioProgramme': + continue + programme_id = item.get('vpid') + duration = int_or_none(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + raise + + # fallback to legacy playlist + return 
self._process_legacy_playlist(playlist_id) + + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): + no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind != 'programme' and kind != 'radioProgramme': + continue + title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text + description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + description = description_el.text if description_el is not None else None + + def get_programme_id(item): + def get_from_attributes(item): + for p in('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) + duration = int_or_none(item.get('duration')) + + if programme_id: + formats, subtitles = 
self._download_media_selector(programme_id) + else: + formats, subtitles = self._process_media_selector(item, playlist_id) + programme_id = playlist_id + + return programme_id, title, description, duration, formats, subtitles + + def _real_extract(self, url): + group_id = self._match_id(url) + + webpage = self._download_webpage(url, group_id, 'Downloading video page') + + programme_id = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage) + description = self._search_regex( + r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + webpage, 'description', fatal=False) + else: + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) + + self._sort_formats(formats) + + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' + + _MEDIASELECTOR_URLS = [ + # Provides HQ HLS streams but fails with geolocation in some cases when it's + # even not geo restricted at all + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + # Provides more formats, namely direct mp4 links, but fails on some videos with + # notukerror for non UK (?) users (e.g. 
+ # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] + + _TESTS = [{ + # article with multiple videos embedded with data-playable containing vpids + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', + }, + 'playlist_count': 2, + }, { + # article with multiple videos embedded with data-playable (more videos) + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', + }, + 'playlist_count': 9, + 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + # broken + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BBC Blogs - Adam Curtis - BUGGER', + }, + 'playlist_count': 18, + }, { + # single video embedded with data-playable containing vpid + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'duration': 47, + 'timestamp': 1427219242, + 'upload_date': '20150324', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with single video embedded with data-playable containing XML playlist + # with direct video links as progressiveDownloadUrl (for now these are extracted) + # and 
playlist with f4m and m3u8 as streamingUrl + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'info_dict': { + 'id': '150615_telabyad_kentin_cogu', + 'ext': 'mp4', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'timestamp': 1434397334, + 'upload_date': '20150615', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video embedded with data-playable containing XML playlists (regional section) + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'info_dict': { + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'timestamp': 1434713142, + 'upload_date': '20150619', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... 
I always wondered what happened to you"''', + 'duration': 56, + }, + 'params': { + 'skip_download': True, + } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'mp4', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1415867444, + 'upload_date': '20141113', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist.sxml URL in playlist param + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with multiple videos embedded with playlist.sxml in playlist param + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': '34475836', + 'title': 'What Liverpool can expect from Klopp', + }, + 'playlist_count': 3, + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 
'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + + def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(url, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + timestamp = None + playlist_title = None + playlist_description = None + + ld = self._parse_json( + self._search_regex( + r'(?s)<script type="application/ld\+json">(.+?)</script>', + webpage, 'ld json', default='{}'), + playlist_id, fatal=False) + if ld: + timestamp = 
parse_iso8601(ld.get('datePublished')) + playlist_title = ld.get('headline') + playlist_description = ld.get('articleBody') + + if not timestamp: + timestamp = parse_iso8601(self._search_regex( + [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"', + r'"datePublished":\s*"([^"]+)'], + webpage, 'date', default=None)) + + entries = [] + + # article with multiple videos embedded with playlist.sxml (e.g. + # http://www.bbc.com/sport/0/football/34475836) + playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) + if playlists: + entries = [ + self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) + for playlist_url in playlists] + + # news article with multiple videos embedded with data-playable + data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) + if data_playables: + for _, data_playable_json in data_playables: + data_playable = self._parse_json( + unescapeHTML(data_playable_json), playlist_id, fatal=False) + if not data_playable: + continue + settings = data_playable.get('settings', {}) + if settings: + # data-playable with video vpid in settings.playlistObject.items (e.g. + # http://www.bbc.com/news/world-us-canada-34473351) + playlist_object = settings.get('playlistObject', {}) + if playlist_object: + items = playlist_object.get('items') + if items and isinstance(items, list): + title = playlist_object['title'] + description = playlist_object.get('summary') + duration = int_or_none(items[0].get('duration')) + programme_id = items[0].get('vpid') + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + else: + # data-playable without vpid but with a playlist.sxml URLs + # in otherSettings.playlist (e.g. 
+ # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) + playlist = data_playable.get('otherSettings', {}).get('playlist', {}) + if playlist: + entries.append(self._extract_from_playlist_sxml( + playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + + if entries: + playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') + playlist_description = playlist_description or self._og_search_description(webpage, default=None) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-video-player-vpid="([\da-z]{8})"', + r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + webpage, 'vpid', default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. 
http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + playlist_title = self._html_search_regex( + r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') + playlist_description = self._og_search_description(webpage, default=None) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry, 'BBCCoUk') for entry in entries], + playlist_id, playlist_title, playlist_description) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = extract_all(r"data-media-meta='({[^']+})'") + + if not medias: + # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( + self._search_regex( + r']+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)', + webpage, 'playlist data'), + playlist_id) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } 
for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + +class BBCCoUkArticleIE(InfoExtractor): + _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + IE_NAME = 'bbc.co.uk:article' + IE_DESC = 'BBC articles' + + _TEST = { + 'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer', + 'info_dict': { + 'id': '3jNQLTMrPlYGTBn0WV6M2MS', + 'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four', + 'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.', + }, + 'playlist_count': 4, + 'add_ie': ['BBCCoUk'], + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage).strip() + + entries = [self.url_result(programme_url) for programme_url in re.findall( + r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py deleted file mode 100644 index 126c8824c..000000000 --- a/youtube_dl/extractor/bbccouk.py +++ /dev/null @@ -1,340 +0,0 @@ -from __future__ import unicode_literals - -import xml.etree.ElementTree - -from .subtitles import SubtitlesInfoExtractor -from ..utils import ExtractorError -from ..compat import compat_HTTPError - - -class BBCCoUkIE(SubtitlesInfoExtractor): - IE_NAME = 'bbc.co.uk' - IE_DESC = 'BBC iPlayer' - _VALID_URL = 
r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - - _TESTS = [ - { - 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', - 'info_dict': { - 'id': 'b039d07m', - 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', - 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Man in Black: Series 3: The Printed Name', - 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", - 'duration': 1800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Episode is no longer available on BBC iPlayer Radio', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", - 'duration': 5100, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', - 'info_dict': { - 'id': 'b03k3pb7', - 'ext': 'flv', - 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", - 'description': '2. 
Invasion', - 'duration': 3600, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, { - 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', - 'info_dict': { - 'id': 'b04v209v', - 'ext': 'flv', - 'title': 'Pete Tong, The Essential New Tune Special', - 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", - 'duration': 10800, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', - 'note': 'Audio', - 'info_dict': { - 'id': 'p02frcch', - 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', - 'note': 'Video', - 'info_dict': { - 'id': 'p025c103', - 'ext': 'flv', - 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', - 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', - 'duration': 226, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', - 'only_matching': True, - } - ] - - def _extract_asx_playlist(self, connection, programme_id): - asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') - return [ref.get('href') for ref in asx.findall('./Entry/ref')] - - def _extract_connection(self, connection, programme_id): - formats = [] - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if 
protocol == 'http': - href = connection.get('href') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - - def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') - - def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') - if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') - - def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') - - def _extract_video(self, media, programme_id): - formats = [] - vbr = int(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int(media.get('width')) - height = int(media.get('height')) - file_size = int(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': 
'%s_%s' % (service, format['format_id']), - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - }) - formats.extend(conn_formats) - return formats - - def _extract_captions(self, media, programme_id): - subtitles = {} - for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) - srt = '' - for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), - p.text.strip() if p.text is not None else '') - subtitles[lang] = srt - return subtitles - - def _download_media_selector(self, programme_id): - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, 
programme_id)) - elif kind == 'captions': - subtitles = self._extract_captions(media, programme_id) - - return formats, subtitles - - def _download_playlist(self, playlist_id): - try: - playlist = self._download_json( - 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, - playlist_id, 'Downloading playlist JSON') - - version = playlist.get('defaultAvailableVersion') - if version: - smp_config = version['smpConfig'] - title = smp_config['title'] - description = smp_config['summary'] - for item in smp_config['items']: - kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': - continue - programme_id = item.get('vpid') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - raise - - # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') - - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') - if no_items is not None: - reason = no_items.get('reason') - if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % playlist_id - elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % playlist_id - elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % playlist_id - else: - msg = 'Episode %s is not available: %s' % (playlist_id, reason) - raise ExtractorError(msg, expected=True) - - for item in self._extract_items(playlist): - kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': - continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') - 
duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - - return programme_id, title, description, duration, formats, subtitles - - def _real_extract(self, url): - group_id = self._match_id(url) - - webpage = self._download_webpage(url, group_id, 'Downloading video page') - - programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) - if programme_id: - player = self._download_json( - 'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id, - group_id)['jsConf']['player'] - title = player['title'] - description = player['subtitle'] - duration = player['duration'] - formats, subtitles = self._download_media_selector(programme_id) - else: - programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(programme_id, subtitles) - return - - self._sort_formats(formats) - - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py new file mode 100644 index 000000000..3c7775d3e --- /dev/null +++ b/youtube_dl/extractor/beatportpro.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class BeatportProIE(InfoExtractor): + _VALID_URL = r'https?://pro\.beatport\.com/track/(?P[^/]+)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', + 'md5': 'b3c34d8639a2f6a7f734382358478887', + 'info_dict': { + 'id': '5379371', + 'display_id': 'synesthesia-original-mix', + 'ext': 'mp4', + 'title': 'Froxic - Synesthesia (Original Mix)', + }, + }, { + 'url': 
'https://pro.beatport.com/track/love-and-war-original-mix/3756896', + 'md5': 'e44c3025dfa38c6577fbaeb43da43514', + 'info_dict': { + 'id': '3756896', + 'display_id': 'love-and-war-original-mix', + 'ext': 'mp3', + 'title': 'Wolfgang Gartner - Love & War (Original Mix)', + }, + }, { + 'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', + 'md5': 'a1fd8e8046de3950fd039304c186c05f', + 'info_dict': { + 'id': '4991738', + 'display_id': 'birds-original-mix', + 'ext': 'mp4', + 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + playables = self._parse_json( + self._search_regex( + r'window\.Playables\s*=\s*({.+?});', webpage, + 'playables info', flags=re.DOTALL), + track_id) + + track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + + title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + if track['mix']: + title += ' (' + track['mix'] + ')' + + formats = [] + for ext, info in track['preview'].items(): + if not info['url']: + continue + fmt = { + 'url': info['url'], + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } + if ext == 'mp3': + fmt['preference'] = 0 + fmt['acodec'] = 'mp3' + fmt['abr'] = 96 + fmt['asr'] = 44100 + elif ext == 'mp4': + fmt['preference'] = 1 + fmt['acodec'] = 'aac' + fmt['abr'] = 96 + fmt['asr'] = 44100 + formats.append(fmt) + self._sort_formats(formats) + + images = [] + for name, info in track['images'].items(): + image_url = info.get('url') + if name == 'dynamic' or not image_url: + continue + image = { + 'id': name, + 'url': image_url, + 'height': int_or_none(info.get('height')), + 'width': int_or_none(info.get('width')), + } + images.append(image) + + return { + 'id': compat_str(track.get('id')) or track_id, + 'display_id': track.get('slug') or 
display_id, + 'title': title, + 'formats': formats, + 'thumbnails': images, + } diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 4e79fea8f..61bc2f744 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,65 +1,69 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '634526ae978711f6b748fe0dd6c11f57', + 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', - 'description': 'md5:6db3c6177972822aaba18652ff59c773', - 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - quality_arr = self._search_regex( - r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats') + video_id = self._match_id(url) - formats = [{ - 'url': fmt[1], - 'format_id': fmt[0], - 'height': int(fmt[0][:-1]), - } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)] + video = self._download_json( + 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + formats = [] + for format_id, video_url in video.items(): + if not video_url: + continue + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'format_id': format_id, + 'height': int(height), + }) self._sort_formats(formats) - title = self._html_search_regex( - 
r'([^<]+)\s*-\s*beeg\.?', webpage, 'title') + title = video['title'] + video_id = video.get('id') or video_id + display_id = video.get('code') + description = video.get('desc') - description = self._html_search_regex( - r'[0-9]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402', + 'id': '1074402_part1', 'ext': 'flv', 'title': '【金坷垃】金泡沫', 'duration': 308, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', }, - } + }, { + 'url': 'http://www.bilibili.com/video/av1041170/', + 'info_dict': { + 'id': '1041170', + 'title': '【BD1080P】刀语【诸神&异域】', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if '(此视频不存在或被删除)' in webpage: + raise ExtractorError( + 'The video does not exist or was deleted', expected=True) + + if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: + raise ExtractorError( + 'The video is not available in your region due to copyright reasons', + expected=True) + video_code = self._search_regex( r'(?s)
(.*?)
', webpage, 'video code') @@ -54,19 +76,22 @@ class BiliBiliIE(InfoExtractor): cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - lq_doc = self._download_xml( + entries = [] + + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) - lq_durl = lq_doc.find('./durl') - formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), - }] + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = compat_etree_fromstring(lq_page) + lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, @@ -75,22 +100,45 @@ class BiliBiliIE(InfoExtractor): fatal=False, ) if hq_doc is not False: - hq_durl = hq_doc.find('./durl') - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) + + i = 1 + for lq_durl, hq_durl in zip(lq_durls, hq_durls): + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), + lq_durl.find('./size'), get_attr='text'), + }] + if hq_durl is not None: + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': hq_durl.find('./url').text, + 'filesize': int_or_none( + hq_durl.find('./size'), get_attr='text'), + }) + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%d' % (video_id, i), + 'title': title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnail': thumbnail, }) - self._sort_formats(formats) + i += 1 + return { + '_type': 'multi_video', + 'entries': 
entries, 'id': video_id, - 'title': title, - 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, + 'title': title } diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 3e461e715..3b8eabe8f 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -1,40 +1,35 @@ from __future__ import unicode_literals import json -import re from .common import InfoExtractor -from ..utils import remove_start +from ..utils import ( + remove_start, + int_or_none, +) class BlinkxIE(InfoExtractor): - _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P[^?]+)' + _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P[^?]+)' IE_NAME = 'blinkx' _TEST = { - 'url': 'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB', - 'md5': '2e9a07364af40163a908edbf10bb2492', + 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', + 'md5': '337cf7a344663ec79bf93a526a2e06c7', 'info_dict': { - 'id': '8aQUy7GV', + 'id': 'Da0Gw3xc', 'ext': 'mp4', - 'title': 'Police Car Rolls Away', - 'uploader': 'stupidvideos.com', - 'upload_date': '20131215', - 'timestamp': 1387068000, - 'description': 'A police car gently rolls away from a fight. 
Maybe it felt weird being around a confrontation and just had to get out of there!', - 'duration': 14.886, - 'thumbnails': [{ - 'width': 100, - 'height': 76, - 'resolution': '100x76', - 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg', - }], + 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', + 'uploader': 'IGN News', + 'upload_date': '20150217', + 'timestamp': 1424215740, + 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', + 'duration': 47.743333, }, } - def _real_extract(self, rl): - m = re.match(self._VALID_URL, rl) - video_id = m.group('id') + def _real_extract(self, url): + video_id = self._match_id(url) display_id = video_id[:8] api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + @@ -60,18 +55,20 @@ class BlinkxIE(InfoExtractor): elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') - tbr = (int(m['vbr']) + int(m['abr'])) // 1000 + vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) + abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) + tbr = vbr + abr if vbr and abr else None format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) formats.append({ 'format_id': format_id, 'url': m['link'], 'vcodec': vcodec, 'acodec': acodec, - 'abr': int(m['abr']) // 1000, - 'vbr': int(m['vbr']) // 1000, + 'abr': abr, + 'vbr': vbr, 'tbr': tbr, - 'width': int(m['w']), - 'height': int(m['h']), + 'width': int_or_none(m.get('w')), + 'height': int_or_none(m.get('h')), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 436cc5155..35375f7b1 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -3,31 +3,29 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_request, - 
compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( clean_html, int_or_none, parse_iso8601, + sanitized_Request, unescapeHTML, + xpath_text, + xpath_with_ns, ) -class BlipTVIE(SubtitlesInfoExtractor): +class BlipTVIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P\d+)|((?:play/|api\.swf#)(?P[\da-zA-Z+_]+)))' _TESTS = [ { 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'md5': '80baf1ec5c3d2019037c1c707d676b9f', 'info_dict': { 'id': '5779306', - 'ext': 'mov', + 'ext': 'm4v', 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', 'timestamp': 1323138843, @@ -101,8 +99,31 @@ class BlipTVIE(SubtitlesInfoExtractor): 'vcodec': 'none', } }, + { + # missing duration + 'url': 'http://blip.tv/rss/flash/6700880', + 'info_dict': { + 'id': '6684191', + 'ext': 'm4v', + 'title': 'Cowboy Bebop: Gateway Shuffle Review', + 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', + 'timestamp': 1386639757, + 'upload_date': '20131210', + 'uploader': 'sfdebris', + 'uploader_id': '706520', + } + } ] + @staticmethod + def _extract_url(webpage): + mobj = re.search(r']*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) + if mobj: + return 'http://blip.tv/a/a-' + mobj.group(1) + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) + if mobj: + return mobj.group(1) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) lookup_id = mobj.group('lookup_id') @@ -120,35 +141,34 @@ class BlipTVIE(SubtitlesInfoExtractor): rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - def blip(s): - return '{http://blip.tv/dtd/blip/1.0}%s' % s - - def media(s): - return '{http://search.yahoo.com/mrss/}%s' % s - - def itunes(s): - return 
'{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s + def _x(p): + return xpath_with_ns(p, { + 'blip': 'http://blip.tv/dtd/blip/1.0', + 'media': 'http://search.yahoo.com/mrss/', + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + }) item = rss.find('channel/item') - video_id = item.find(blip('item_id')).text - title = item.find('./title').text - description = clean_html(compat_str(item.find(blip('puredescription')).text)) - timestamp = parse_iso8601(item.find(blip('datestamp')).text) - uploader = item.find(blip('user')).text - uploader_id = item.find(blip('userid')).text - duration = int(item.find(blip('runtime')).text) - media_thumbnail = item.find(media('thumbnail')) - thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text - categories = [category.text for category in item.findall('category')] + video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id + title = xpath_text(item, 'title', 'title', fatal=True) + description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) + timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) + uploader = xpath_text(item, _x('blip:user'), 'uploader') + uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') + duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) + media_thumbnail = item.find(_x('media:thumbnail')) + thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None + else xpath_text(item, 'image', 'thumbnail')) + categories = [category.text for category in item.findall('category') if category is not None] formats = [] - subtitles = {} + subtitles_urls = {} - media_group = item.find(media('group')) - for media_content in media_group.findall(media('content')): + media_group = item.find(_x('media:group')) + for media_content in media_group.findall(_x('media:content')): url = media_content.get('url') - role = media_content.get(blip('role')) + role = 
media_content.get(_x('blip:role')) msg = self._download_webpage( url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', video_id, 'Resolving URL for %s' % role) @@ -161,25 +181,22 @@ class BlipTVIE(SubtitlesInfoExtractor): } lang = role.rpartition('-')[-1].strip().lower() langcode = LANGS.get(lang, lang) - subtitles[langcode] = url + subtitles_urls[langcode] = url elif media_type.startswith('video/'): formats.append({ 'url': real_url, 'format_id': role, 'format_note': media_type, - 'vcodec': media_content.get(blip('vcodec')) or 'none', - 'acodec': media_content.get(blip('acodec')), + 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', + 'acodec': media_content.get(_x('blip:acodec')), 'filesize': media_content.get('filesize'), 'width': int_or_none(media_content.get('width')), 'height': int_or_none(media_content.get('height')), }) + self._check_formats(formats, video_id) self._sort_formats(formats) - # subtitles - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, subtitles_urls) return { 'id': video_id, @@ -192,15 +209,22 @@ class BlipTVIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'categories': categories, 'formats': formats, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } - def _download_subtitle_url(self, sub_lang, url): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - return self._download_webpage(req, None, note=False) + def _get_subtitles(self, video_id, subtitles_urls): + subtitles = {} + for lang, url in subtitles_urls.items(): + # For some weird reason, blip.tv serves a video instead of subtitles + # when we request with a common UA + req = sanitized_Request(url) + 
req.add_header('User-Agent', 'youtube-dl') + subtitles[lang] = [{ + # The extension is 'srt' but it's actually an 'ass' file + 'ext': 'ass', + 'data': self._download_webpage(req, None, note=False), + }] + return subtitles class BlipTVUserIE(InfoExtractor): diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index c51a97ce4..ebeef8f2a 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,32 +6,56 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + _TESTS = [{ + 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', # The md5 checksum changes 'info_dict': { 'id': 'qurhIVlJSB6hzkVi229d8g', 'ext': 'flv', 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', - 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88', + 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, - } + }, { + 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', + 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + name = self._match_id(url) webpage = self._download_webpage(url, name) - f4m_url = self._search_regex( - r'.+?)\1', + webpage, 'id', group='url') title = re.sub(': Video$', '', self._og_search_title(webpage)) + embed_info = self._download_json( + 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + formats = [] + for stream in embed_info['streams']: + stream_url = stream.get('url') 
+ if not stream_url: + continue + if stream['muxing_format'] == 'TS': + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + self._sort_formats(formats) + return { - 'id': name.split('-')[-1], + 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, name), + 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 45ba51732..66e394e10 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -16,27 +16,38 @@ class BRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html', - 'md5': '93556dd2bcb2948d9259f8670c516d59', + 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', + 'md5': '83a0477cf0b8451027eb566d88b51106', 'info_dict': { - 'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a', + 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', - 'title': 'Wenn das Traditions-Theater wackelt', - 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', - 'duration': 34, - 'uploader': 'BR', - 'upload_date': '20140802', + 'title': 'Die böse Überraschung', + 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'duration': 180, + 'uploader': 'Reinhard Weber', + 'upload_date': '20150422', } }, { - 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html', - 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820', + 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', + 'md5': 
'a44396d73ab6a68a69a568fae10705bb', 'info_dict': { - 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab', + 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', + 'ext': 'mp4', + 'title': 'Manfred Schreiber ist tot', + 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'duration': 26, + } + }, + { + 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', + 'info_dict': { + 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', - 'title': '"Keine neuen Schulden im nächsten Jahr"', - 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', - 'duration': 64, + 'title': 'Kurzweilig und sehr bewegend', + 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'duration': 296, } }, { diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 4bcc897c9..aa08051b1 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -18,6 +18,7 @@ class BreakIE(InfoExtractor): 'id': '2468056', 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', + 'age_limit': 13, } }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', @@ -41,7 +42,7 @@ class BreakIE(InfoExtractor): 'tbr': media['bitRate'], 'width': media['width'], 'height': media['height'], - } for media in info['media']] + } for media in info['media'] if media.get('mediaPurpose') == 'play'] if not formats: formats.append({ diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ea0969d4d..f5ebae1e6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,28 +3,34 @@ from __future__ import unicode_literals import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, - 
compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..utils import ( determine_ext, ExtractorError, find_xpath_attr, fix_xml_ampersands, + float_or_none, + js_to_json, + int_or_none, + parse_iso8601, + sanitized_Request, unescapeHTML, unsmuggle_url, ) -class BrightcoveIE(InfoExtractor): +class BrightcoveLegacyIE(InfoExtractor): + IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -95,6 +101,7 @@ class BrightcoveIE(InfoExtractor): 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { 'title': 'Sealife', + 'id': '3550319591001', }, 'playlist_mincount': 7, }, @@ -116,7 +123,10 @@ class BrightcoveIE(InfoExtractor): object_str = re.sub(r'(]*)(xmlns=".*?")', r'\1', object_str) object_str = fix_xml_ampersands(object_str) - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + try: + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) + except compat_xml_parse_error: + return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') if fv_el is not None: @@ -152,6 +162,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? 
# skipping width and height + ["\'](?P\d+)["\']\s*,\s* # playerID + ["\'](?PAQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -168,7 +200,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r']+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | [^>]*?>\s*''', + ).+?>\s*''', webpage) - return [cls._build_brighcove_url(m) for m in matches] + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -213,7 +250,7 @@ class BrightcoveIE(InfoExtractor): def _get_video_info(self, video_id, query_str, query, referer=None): request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = compat_urllib_request.Request(request_url) + req = sanitized_Request(request_url) linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] @@ -247,7 +284,7 @@ class BrightcoveIE(InfoExtractor): playlist_info = json_data['videoList'] videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] - return self.playlist_result(videos, playlist_id=playlist_info['id'], + return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): @@ -314,3 +351,172 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not 
info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveNewIE(InfoExtractor): + IE_NAME = 'brightcove:new' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+)' + _TESTS = [{ + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'mp4', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'duration': 165.768, + 'timestamp': 1441391203, + 'upload_date': '20150904', + 'uploader_id': '929656772001', + 'formats': 'mincount:22', + }, + }, { + # with rtmp streams + 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', + 'info_dict': { + 'id': '4279049078001', + 'ext': 'mp4', + 'title': 'Titansgrave: Chapter 0', + 'description': 'Titansgrave: Chapter 0', + 'duration': 1242.058, + 'timestamp': 1433556729, + 'upload_date': '20150606', + 'uploader_id': '4036320279001', + 'formats': 'mincount:41', + }, + 'params': { + 'skip_download': True, + } + }] + + @staticmethod + def _extract_urls(webpage): + # Reference: + # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) + # 3. 
http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + + entries = [] + + # Look for iframe embeds [1] + for _, url in re.findall( + r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url) + + # Look for embed_in_page embeds [2] + for video_id, account_id, player_id, embed in re.findall( + # According to examples from [3] it's unclear whether video id + # may be optional and what to do when it is + r'''(?sx) + ]+ + data-video-id=["\'](\d+)["\'][^>]*>.*? + .*? + ]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js + ''', webpage): + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + return entries + + def _real_extract(self, url): + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + + req = sanitized_Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' + % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + + formats = [] + for source in json_data.get('sources', []): + source_type = source.get('type') + src = source.get('src') + if source_type == 'application/x-mpegURL': + if not src: + continue + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 
'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + f = { + 'tbr': tbr, + 'width': int_or_none(source.get('width')), + 'height': height, + 'filesize': int_or_none(source.get('size')), + 'container': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), + } + + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'preference': 2 if src else 1, + }) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + formats.append(f) + self._sort_formats(formats) + + description = json_data.get('description') + thumbnail = json_data.get('thumbnail') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = float_or_none(json_data.get('duration'), 1000) + tags = json_data.get('tags', []) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader_id': account_id, + 'formats': formats, + 'tags': tags, + } diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index a5d2af174..df503ecc0 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -33,6 +33,7 @@ class BuzzFeedIE(InfoExtractor): 'skip_download': True, # Got enough YouTube download tests }, 'info_dict': { + 'id': 
'look-at-this-cute-dog-omg', 'description': 're:Munchkin the Teddy Bear is back ?!', 'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill', }, @@ -42,8 +43,8 @@ class BuzzFeedIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the Shih Tzu', - 'uploader': 'Munchkin the Shih Tzu', + 'description': 're:© 2014 Munchkin the', + 'uploader': 're:^Munchkin the', 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 6252be05b..3b2de517e 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:5438d33774b6bdc662f9485a340401cc', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*promo.*' + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index c4fefefe4..f6a1ff381 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -4,38 +4,53 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import parse_duration class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { - 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'mp4', - 'title': 'Terrasses du Numérique' + 'ext': 'flv', + 'title': 'Terrasses du Numérique', + 'duration': 122, + }, + 'params': { + 'skip_download': True, # Requires rtmpdump } } def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group('id') - # We need to set the voir 
field for getting the file name - url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - file_name = self._search_regex( - r"so\.addVariable\('file','(.*?)'\);", - webpage, 'file name') - video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + video_url = self._search_regex( + r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P.+?)\2', + webpage, 'video_url', group='file') + formats = [{'url': video_url}] + if video_url.startswith('rtmp://'): + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) + formats[0].update({ + 'url': rtmp.group('url'), + 'ext': 'flv', + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + 'page_url': url, + }) title = self._html_search_regex( - r'class="evenement8">(.*?)
', webpage, 'title') + r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') + duration = parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': video_id, - 'ext': 'mp4', - 'url': video_url, 'title': title, + 'duration': duration, + 'formats': formats, } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1b14471e5..004372f8d 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor): } _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - 'md5': '3db39fb48b9685438ecf33a1078023e4', + 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', + 'md5': 'b3481d7ca972f61e37420798d0a9d934', 'info_dict': { - 'id': '922470', + 'id': '1263092', 'ext': 'flv', - 'title': 'Zapping - 26/08/13', - 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', - 'upload_date': '20130826', + 'title': 'Le Zapping - 13/05/15', + 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', + 'upload_date': '20150513', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', @@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor): 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': '65aa83ad62fe107ce29e564bb8712580', + 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4', 'info_dict': { 'id': '1213714', 'ext': 'flv', @@ -78,7 +78,8 @@ class CanalplusIE(InfoExtractor): if video_id is None: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r']+?videoId="(\d+)"', webpage, 'video id') + [r']+?videoId=(["\'])(?P\d+)', r'id=["\']canal_video_player(?P\d+)'], + webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) doc = self._download_xml(info_url, video_id, 
'Downloading video XML') @@ -106,15 +107,11 @@ class CanalplusIE(InfoExtractor): continue format_id = fmt.tag if format_id == 'HLS': - hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv') - for fmt in hls_formats: - fmt['preference'] = preference(format_id) - formats.extend(hls_formats) + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', preference=preference(format_id))) elif format_id == 'HDS': - hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id) - for fmt in hds_formats: - fmt['preference'] = preference(format_id) - formats.extend(hds_formats) + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id))) else: formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index e43756ec6..40d07ab18 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,17 +1,20 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + sanitized_Request, + smuggle_url, +) class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '4JUVEwq3wUT7', + 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'flv', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -26,6 +29,7 @@ class CBSIE(InfoExtractor): 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', 'info_dict': { 'id': 'WWF_5KqY3PK1', + 'display_id': 'st-vincent', 'ext': 'flv', 'title': 'Live on Letterman - St. Vincent', 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', @@ -36,13 +40,29 @@ class CBSIE(InfoExtractor): 'skip_download': True, }, '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + request = sanitized_Request(url) + # Android UA is served with higher quality (720p) streams (see + # https://github.com/rg3/youtube-dl/issues/7490) + request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)') + webpage = self._download_webpage(request, display_id) real_id = self._search_regex( - r"video\.settings\.pid\s*=\s*'([^']+)';", + [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') - return self.url_result('theplatform:%s' % real_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id, + {'force_smil_url': True}), + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7e47960ab..f9a64a0a2 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -32,7 +32,7 @@ class 
CBSNewsIE(InfoExtractor): 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', 'ext': 'flv', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', - 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, }, 'params': { @@ -67,9 +67,12 @@ class CBSNewsIE(InfoExtractor): 'format_id': format_id, } if uri.startswith('rtmp'): + play_path = re.sub( + r'{slistFilePath}', '', + uri.split('')[-1].split('{break}')[-1]) fmt.update({ 'app': 'ondemand?auth=cbs', - 'play_path': 'mp4:' + uri.split('')[-1], + 'play_path': 'mp4:' + play_path, 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf', 'page_url': 'http://www.cbsnews.com', 'ext': 'flv', diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py new file mode 100644 index 000000000..ae47e74cc --- /dev/null +++ b/youtube_dl/extractor/cbssports.py @@ -0,0 +1,30 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CBSSportsIE(InfoExtractor): + _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' + + _TEST = { + 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', + 'info_dict': { + 'id': '_d5_GbO8p1sT', + 'ext': 'flv', + 'title': 'US Open flashbacks: 1990s', + 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + section = mobj.group('section') + video_id = mobj.group('id') + all_videos = self._download_json( + 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section, + video_id) + # The json file contains the info of all the videos in the section + video_info = next(v for v in all_videos if v['pcid'] == video_id) + return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform') diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 2a5d4be18..6924eac70 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -16,7 +16,7 @@ class CCCIE(InfoExtractor): _TEST = { 'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video', - 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be', + 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { 'id': '20131228183', 'ext': 'mp4', @@ -51,7 +51,7 @@ class CCCIE(InfoExtractor): matches = re.finditer(r'''(?xs) <(?:span|div)\s+class='label\s+filetype'>(?P.*?)\s* - [^']+)'>\s* + [^']+)'>\s* (?: .*? 
[^']+\.torrent)' diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index f70e090bb..6f7b2a70d 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,68 +3,95 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( - compat_urllib_request, compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, float_or_none, + sanitized_Request, ) -class CeskaTelevizeIE(SubtitlesInfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' - - _TESTS = [ - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494876951776', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '61924494876844374', + 'ext': 'mp4', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 
1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ 'info_dict': { - 'id': '214411058091220', + 'id': '61924494876844842', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace', - 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, }, - }, - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + }, { 'info_dict': { - 'id': '14716', + 'id': '61924494877068022', 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'title': 'Queer: Bogotart (Queer)', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'duration': 1558.3, }, + }], + 'params': { + # m3u8 download + 'skip_download': True, }, - ] + }] def _real_extract(self, url): url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, playlist_id) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s

' % NOT_AVAILABLE_STRING in webpage: raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') - episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + typ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') data = { 'playlist[0][type]': typ, @@ -73,7 +100,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request( + req = sanitized_Request( 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', data=compat_urllib_parse.urlencode(data)) @@ -82,54 +109,67 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', url) - playlistpage = self._download_json(req, video_id) + playlistpage = self._download_json(req, playlist_id) playlist_url = playlistpage['url'] if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_json(req, video_id) - - item = playlist['playlist'][0] - formats = [] - for format_id, stream_url in item['streamUrls'].items(): - formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') - - subtitles = {} - subs = item.get('subtitles') - if subs: - subtitles['cs'] = subs[0]['url'] - - if 
self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) - + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) + + playlist = self._download_json(req, playlist_id)['playlist'] + playlist_len = len(playlist) + + entries = [] + for item in playlist: + formats = [] + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + self._sort_formats(formats) + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + entries.append({ + 'id': item_id, + 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) return { - 'id': episode_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] } @staticmethod def _fix_subtitles(subtitles): """ Convert millisecond-based subtitles to SRT """ - if subtitles is None: - return subtitles # subtitles not requested def _msectotimecode(msec): """ 
Helper utility to convert milliseconds to timecode """ @@ -149,7 +189,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): else: yield line - fixed_subtitles = {} - for k, v in subtitles.items(): - fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) - return fixed_subtitles + return "\r\n".join(_fix_subtitle(subtitles)) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3dfc24f5b..c74553dcf 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_filesize, + qualities, +) class Channel9IE(InfoExtractor): @@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, + }, + { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, } ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - # 
Sorted by quality - _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] - - def _restore_bytes(self, formatted_size): - if not formatted_size: - return 0 - m = re.match(r'^(?P\d+(?:\.\d+)?)\s+(?P[a-zA-Z]+)', formatted_size) - if not m: - return 0 - units = m.group('units') - try: - exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) - except ValueError: - return 0 - size = float(m.group('size')) - return int(size * (1024 ** exponent)) - def _formats_from_html(self, html): FORMAT_REGEX = r''' (?x) @@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):

File\s+size

\s*(?P.*?)\s* )? # File size part may be missing ''' - # Extract known formats + quality = qualities(( + 'MP3', 'MP4', + 'Low Quality WMV', 'Low Quality MP4', + 'Mid Quality WMV', 'Mid Quality MP4', + 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - 'preference': self._known_formats.index(x.group('quality')), + 'filesize_approx': parse_filesize(x.group('filesize')), + 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) @@ -158,7 +164,7 @@ class Channel9IE(InfoExtractor): def _extract_session_day(self, html): m = re.search(r'
  • \s*(?P[^<]+)\s*
  • ', html) - return m.group('day') if m is not None else None + return m.group('day').strip() if m is not None else None def _extract_session_room(self, html): m = re.search(r'
  • \s*(?P.+?)\s*
  • ', html) @@ -224,12 +230,12 @@ class Channel9IE(InfoExtractor): if contents is None: return contents - authors = self._extract_authors(html) + if len(contents) > 1: + raise ExtractorError('Got more than one entry') + result = contents[0] + result['authors'] = self._extract_authors(html) - for content in contents: - content['authors'] = authors - - return contents + return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py new file mode 100644 index 000000000..0b67ba67d --- /dev/null +++ b/youtube_dl/extractor/chaturbate.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class ChaturbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.chaturbate.com/siswet19/', + 'info_dict': { + 'id': 'siswet19', + 'ext': 'mp4', + 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://en.chaturbate.com/siswet19/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'src=(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage, + 'playlist', default=None, group='url') + + if not m3u8_url: + error = self._search_regex( + r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', + webpage, 'error', group='error') + raise ExtractorError(error, expected=True) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id, + 'age_limit': self._rta_search(webpage), + 'is_live': 
True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..b1eeaf101 --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) + + +class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'duration': 52, + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', 
webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) + + return { + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + } + + +class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'info_dict': { + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) + + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] + + title = rss.find('./channel/title').text + + return self.playlist_result(entries, profile_id, title) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..fd1770dac --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,111 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError +from .bliptv import BlipTVIE +from .screenwavemedia import ScreenwaveMediaIE + + +class CinemassacreIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/(?P[^?#/]+)' + _TESTS = [ + { + 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', + 'info_dict': { + 'id': 'Cinemassacre-19911', + 'ext': 'mp4', + 'upload_date': '20121110', + 'title': '“Angry Video Game Nerd: The Movie” – Trailer', + 'description': 'md5:fb87405fcb42a331742a0dce2708560b', + }, + }, + { + 'url': 
'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + 'md5': 'd72f10cd39eac4215048f62ab477a511', + 'info_dict': { + 'id': 'Cinemassacre-521be8ef82b16', + 'ext': 'mp4', + 'upload_date': '20131002', + 'title': 'The Mummy’s Hand (1940)', + }, + }, + { + # blip.tv embedded video + 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', + 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'info_dict': { + 'id': '4065369', + 'ext': 'flv', + 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', + 'upload_date': '20061207', + 'uploader': 'cinemassacre', + 'uploader_id': '250778', + 'timestamp': 1283233867, + 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + } + }, + { + # Youtube embedded video + 'url': 'http://cinemassacre.com/2006/09/01/mckids/', + 'md5': '6eb30961fa795fedc750eac4881ad2e1', + 'info_dict': { + 'id': 'FnxsNhuikpo', + 'ext': 'mp4', + 'upload_date': '20060901', + 'uploader': 'Cinemassacre Extras', + 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', + 'uploader_id': 'Cinemassacre', + 'title': 'AVGN: McKids', + } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') + + webpage = self._download_webpage(url, display_id) + + playerdata_url = self._search_regex( + [ + ScreenwaveMediaIE.EMBED_PATTERN, + r']+src="(?P(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', + ], + webpage, 'player data URL', default=None, group='url') + if not playerdata_url: + playerdata_url = BlipTVIE._extract_url(webpage) + if not playerdata_url: + raise ExtractorError('Unable to find player data') + + video_title = self._html_search_regex( + r'(?P<title>.+?)\|', webpage, 'title') + video_description = self._html_search_regex( + r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, 'description', flags=re.DOTALL, fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + 'url': playerdata_url, + } diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index a5c3cb7c6..7af903571 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,53 +1,68 @@ from __future__ import unicode_literals import re -import time -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - ExtractorError, - parse_duration, + determine_ext, + int_or_none, + js_to_json, + parse_iso8601, + remove_end, ) class ClipfishIE(InfoExtractor): - IE_NAME = 'clipfish' - - _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = 
{ 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', - 'md5': '2521cd644e862936cf2e698206e47385', + 'md5': '79bc922f3e8a9097b3d68a93780fd475', 'info_dict': { 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', + 'timestamp': 1370938118, + 'upload_date': '20130611', 'duration': 82, - }, - 'skip': 'Blocked in the US' + } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - - info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % - (video_id, int(time.time()))) - doc = self._download_xml( - info_url, video_id, note='Downloading info page') - title = doc.find('title').text - video_url = doc.find('filename').text - if video_url is None: - xml_bytes = xml.etree.ElementTree.tostring(doc) - raise ExtractorError('Cannot find video URL in document %r' % - xml_bytes) - thumbnail = doc.find('imageurl').text - duration = parse_duration(doc.find('duration').text) + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_info = self._parse_json( + js_to_json(self._html_search_regex( + '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), + video_id) + + formats = [] + for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.append({ + 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + }) + else: + formats.append({ + 'url': video_url, + 'format_id': ext, + }) + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - Video') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(video_info.get('length')) + timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': 
timestamp, } diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d07d544ea..8306d6fb7 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -10,9 +8,9 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' + _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'md5': '4d7d549451bad625e0ff3d7bd56d776c', 'info_dict': { @@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor): 'duration': 612, 'thumbnail': 're:^https?://.+\.jpg', }, - } + }, { + 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, video_id, 'Downlaoding player') diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index abf8cc280..0fa720ee8 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -105,6 +105,7 @@ class CloudyIE(InfoExtractor): webpage = self._download_webpage(url, video_id) file_key = self._search_regex( - r'filekey\s*=\s*"([^"]+)"', webpage, 'file_key') + [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], + webpage, 'file_key') return self._extract_video(video_host, video_id, file_key) diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 14f215c5c..1dfa7c12e 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,9 +12,9 @@ from ..utils 
import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html' + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', 'md5': '1592b694ba586036efac1776b0b43cd3', 'info_dict': { @@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor): 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', } - } + }, { + 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py new file mode 100644 index 000000000..57e643799 --- /dev/null +++ b/youtube_dl/extractor/clyp.py @@ -0,0 +1,57 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, +) + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + _TEST = { + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', + }, + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, 
+ 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = metadata['Title'] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = parse_iso8601(metadata.get('DateCreated')) + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index e96c59f71..f1311b14f 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -4,7 +4,7 @@ from .mtv import MTVIE class CMTIE(MTVIE): IE_NAME = 'cmt.com' - _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml' + _VALID_URL = r'https?://www\.cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P<videoid>\d+)' _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' _TESTS = [{ @@ -16,4 +16,7 @@ class CMTIE(MTVIE): 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + }, { + 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3145b3051..5dd69bff7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -11,7 +11,7 @@ from ..utils import ( class CNETIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', @@ -25,7 +25,20 @@ class CNETIE(InfoExtractor): 'params': { 'skip_download': 'requires rtmpdump', } - } + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'info_dict': { + 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', + 'ext': 'flv', + 'description': 
'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -42,7 +55,7 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files']['rtmp'] + vid = vdata['files'].get('rtmp', vdata['files']['hds']) tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) video_id = vdata['id'] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 90ea07438..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -45,6 +45,12 @@ class CNNIE(InfoExtractor): 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, + }, { + 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index fedd48490..40667a0f1 100644 --- 
a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -3,10 +3,10 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..compat import compat_urllib_request from ..utils import ( float_or_none, int_or_none, + sanitized_Request, ) @@ -52,7 +52,7 @@ class CollegeRamaIE(InfoExtractor): } } - request = compat_urllib_request.Request( + request = sanitized_Request( 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', json.dumps(player_options_request)) request.add_header('Content-Type', 'application/json') diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 9c25b2223..81f3d7697 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor): webpage, 'full data json')) video_id = full_data['activeVideo']['video'] - video_data = full_data['videos'][video_id] + video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] thumbnails = [{ 'url': video_data['images']['thumb'], }, { diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index b24538981..3e4bd10b6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj.group('shortname'): - if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://thedailyshow.cc.com/full-episodes/' - else: - url = 'http://thecolbertreport.cc.com/full-episodes/' - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - assert mobj is not None + return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') if mobj.group('clip'): if mobj.group('videotitle'): @@ -201,7 +196,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): uri = 
mMovieParams[0][1] # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri) + uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) idoc = self._download_xml( @@ -250,6 +245,8 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): }) self._sort_formats(formats) + subtitles = self._extract_subtitles(cdoc, guid) + virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) entries.append({ 'id': guid, @@ -260,6 +257,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): 'duration': duration, 'thumbnail': thumbnail, 'description': description, + 'subtitles': subtitles, }) return { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 48742189a..eb9bfa3d1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,30 +10,39 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, - compat_HTTPError, + compat_cookies, + compat_getpass, compat_http_client, compat_urllib_error, + compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( + NO_DEFAULT, age_restricted, + bug_reports_message, clean_html, compiled_regex_type, + determine_ext, ExtractorError, + fix_xml_ampersands, float_or_none, - HEADRequest, int_or_none, RegexNotFoundError, sanitize_filename, + sanitized_Request, unescapeHTML, + unified_strdate, + url_basename, + xpath_text, + xpath_with_ns, ) -_NO_DEFAULT = object() class InfoExtractor(object): @@ -47,7 +56,7 @@ class InfoExtractor(object): information possibly downloading the video to the file system, among other possible outcomes. - The type field determines the the type of the result. + The type field determines the type of the result. 
By far the most common value (and the default if _type is missing) is "video", which indicates a single video. @@ -63,7 +72,7 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing + * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. @@ -111,11 +120,8 @@ class InfoExtractor(object): (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. - * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. - * http_post_data Additional data to send with a POST - request. * stretched_ratio If given and not 1, indicates that the video's pixels are not square. width : height ratio as float. @@ -146,17 +152,26 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. creator: The main artist who created the video. + release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. 
"subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following @@ -171,13 +186,18 @@ class InfoExtractor(object): Set to "root" to indicate that this is a comment to the original video. age_limit: Age restriction for the video, as an integer (years) - webpage_url: The url to the video webpage, if given to youtube-dl it + webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. Unless mentioned otherwise, the fields should be Unicode strings. 
@@ -188,8 +208,8 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title" and "id" attributes with the same - semantics as videos (see above). + Additionally, playlists can have "title", "description" and "id" attributes + with the same semantics as videos (see above). _type "multi_video" indicates that there are multiple videos that @@ -290,11 +310,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -319,7 +339,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -329,14 +349,11 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) return (content, urlh) - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() - if prefix is not None: - webpage_bytes = 
prefix + webpage_bytes + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -349,6 +366,16 @@ class InfoExtractor(object): encoding = 'utf-16' else: encoding = 'utf-8' + + return encoding + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes + if not encoding: + encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -392,16 +419,26 @@ class InfoExtractor(object): if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if '<title>The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

    (.*?)

    ', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -416,23 +453,24 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True): + transform_source=None, fatal=True, encoding=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) if xml_string is False: return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True): + fatal=True, encoding=None): json_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding) if (not fatal) and json_string is False: return None return self._parse_json( @@ -475,16 +513,30 @@ class 
InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None): - """Returns a url that points to a page that should be processed""" + def url_result(url, ie=None, video_id=None, video_title=None): + """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} if video_id is not None: video_info['id'] = video_id + if video_title is not None: + video_info['title'] = video_title return video_info @staticmethod @@ -500,7 +552,7 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. 
@@ -526,16 +578,15 @@ class InfoExtractor(object): return next(g for g in mobj.groups() if g is not None) else: return mobj.group(group) - elif default is not _NO_DEFAULT: + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: - self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ @@ -547,7 +598,7 @@ class InfoExtractor(object): def _get_login_info(self): """ - Get the the login info as (username, password) + Get the login info as (username, password) It will look in the netrc file using the _NETRC_MACHINE value If there's no info available, return (None, None) """ @@ -575,7 +626,7 @@ class InfoExtractor(object): return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -589,19 +640,26 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) 
template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -611,7 +669,7 @@ class InfoExtractor(object): return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) @@ -632,9 +690,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -665,7 +721,7 @@ class InfoExtractor(object): return RATING_TABLE.get(rating.lower(), None) def _family_friendly_search(self, html): - # See http://schema.org/VideoObj + # See http://schema.org/VideoObject family_friendly = self._html_search_meta('isFamilyFriendly', html) if not family_friendly: @@ -683,7 +739,29 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _sort_formats(self, formats): + @staticmethod + def _hidden_inputs(html): + html = re.sub(r'', '', html) + hidden_inputs = {} + for input in re.findall(r'(?i)]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): + continue + name = re.search(r'name=(["\'])(?P.+?)\1', input) + if not name: + continue + value = re.search(r'value=(["\'])(?P.*?)\1', input) + if 
not value: + continue + hidden_inputs[name.group('value')] = value.group('value') + return hidden_inputs + + def _form_hidden_inputs(self, form_id, html): + form = self._search_regex( + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
    .+?)
    ' % form_id, + html, '%s form' % form_id, group='form') + return self._hidden_inputs(form) + + def _sort_formats(self, formats, field_preference=None): if not formats: raise ExtractorError('No video formats found') @@ -693,6 +771,9 @@ class InfoExtractor(object): if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) + if isinstance(field_preference, (list, tuple)): + return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + preference = f.get('preference') if preference is None: proto = f.get('protocol') @@ -729,6 +810,7 @@ class InfoExtractor(object): f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, f.get('tbr') if f.get('tbr') is not None else -1, + f.get('filesize') if f.get('filesize') is not None else -1, f.get('vbr') if f.get('vbr') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, @@ -736,10 +818,9 @@ class InfoExtractor(object): f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('fps') if f.get('fps') is not None else -1, - f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id'), + f.get('format_id') if f.get('format_id') is not None else '', ) formats.sort(key=_formats_key) @@ -752,15 +833,17 @@ class InfoExtractor(object): formats) def _is_valid_url(self, url, video_id, item='video'): + url = self._proto_relative_url(url, scheme='http:') + # For now assume non HTTP(S) URLs always valid + if not (url.startswith('http://') or url.startswith('https://')): + return True try: - self._request_webpage( - HEADRequest(url), video_id, - 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item) return True 
except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - self.report_warning( - '%s URL is invalid, skipping' % item, video_id) + if isinstance(e.cause, compat_urllib_error.URLError): + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) return False raise @@ -788,10 +871,19 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest') + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -799,13 +891,32 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + base_url = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' - + (media_el.attrib.get('href') or media_el.attrib.get('url'))) + media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + if not media_url: + continue + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # 
since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + if determine_ext(manifest_url) == 'f4m': + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) + continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ - 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), + 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), 'url': manifest_url, 'ext': 'flv', 'tbr': tbr, @@ -819,14 +930,15 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None, + fatal=True): formats = [{ - 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])), + 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] @@ -836,11 +948,17 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + res = self._download_webpage_handle( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information', + fatal=fatal) + if res is False: + return res + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() last_info = None + last_media = None kv_rex = re.compile( r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): @@ -851,6 +969,13 @@ class InfoExtractor(object): if v.startswith('"'): v = v[1:-1] last_info[m.group('key')] = v + elif 
line.startswith('#EXT-X-MEDIA:'): + last_media = {} + for m in kv_rex.finditer(line): + v = m.group('val') + if v.startswith('"'): + v = v[1:-1] + last_media[m.group('key')] = v elif line.startswith('#') or not line.strip(): continue else: @@ -858,8 +983,13 @@ class InfoExtractor(object): formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + format_id = [] + if m3u8_id: + format_id.append(m3u8_id) + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { - 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])), + 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, @@ -879,57 +1009,246 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + if last_media is not None: + f['m3u8_media'] = last_media + last_media = None formats.append(f) last_info = {} self._sort_formats(formats) return formats - # TODO: improve extraction - def _extract_smil_formats(self, smil_url, video_id, fatal=True): - smil = self._download_xml( - smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + @staticmethod + def _xpath_ns(path, namespace=None): + if not namespace: + return path + out = [] + for c in path.split('/'): + if not c or c == '.': + out.append(c) + else: + out.append('{%s}%s' % (namespace, c)) + return '/'.join(out) + + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: assert not fatal return [] - base = smil.find('./head/meta').get('base') + namespace = self._parse_smil_namespace(smil) + + return self._parse_smil_formats( + smil, smil_url, video_id, 
namespace=namespace, f4m_params=f4m_params) + + def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: + return {} + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + + def _download_smil(self, smil_url, video_id, fatal=True): + return self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', fatal=fatal) + + def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): + namespace = self._parse_smil_namespace(smil) + + formats = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + + video_id = os.path.splitext(url_basename(smil_url))[0] + title = None + description = None + upload_date = None + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + name = meta.attrib.get('name') + content = meta.attrib.get('content') + if not name or not content: + continue + if not title and name == 'title': + title = content + elif not description and name in ('description', 'abstract'): + description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] + + return { + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + def _parse_smil_namespace(self, smil): + return self._search_regex( + r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, 
transform_rtmp_url=None): + base = smil_url + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + b = meta.get('base') or meta.get('httpBase') + if b: + base = b + break formats = [] rtmp_count = 0 - for video in smil.findall('./body/switch/video'): + http_count = 0 + + videos = smil.findall(self._xpath_ns('.//video', namespace)) + for video in videos: src = video.get('src') if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) proto = video.get('proto') - if not proto: - if base: - if base.startswith('rtmp'): - proto = 'rtmp' - elif base.startswith('http'): - proto = 'http' ext = video.get('ext') - if proto == 'm3u8': - formats.extend(self._extract_m3u8_formats(src, video_id, ext)) - elif proto == 'rtmp': + src_ext = determine_ext(src) + streamer = video.get('streamer') or base + + if proto == 'rtmp' or streamer.startswith('rtmp'): rtmp_count += 1 - streamer = video.get('streamer') or base formats.append({ 'url': streamer, 'play_path': src, 'ext': 'flv', 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) + continue + + src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + + if proto == 'm3u8' or src_ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + continue + + if src_ext == 'f4m': + f4m_url = src_url + if not f4m_params: + f4m_params = { + 'hdcore': '3.2.0', 
+ 'plugin': 'flowplayer-3.2.0.1', + } + f4m_url += '&' if '?' in f4m_url else '?' + f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + continue + + if src_url.startswith('http') and self._is_valid_url(src, video_id): + http_count += 1 + formats.append({ + 'url': src_url, + 'ext': ext or src_ext or 'flv', + 'format_id': 'http-%d' % (bitrate or http_count), + 'tbr': bitrate, + 'filesize': filesize, 'width': width, 'height': height, }) + continue + self._sort_formats(formats) return formats + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + subtitles = {} + for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + src = textstream.get('src') + if not src: + continue + ext = textstream.get('ext') or determine_ext(src) + if not ext: + type_ = textstream.get('type') + SUBTITLES_TYPES = { + 'text/vtt': 'vtt', + 'text/srt': 'srt', + 'application/smptett+xml': 'tt', + } + if type_ in SUBTITLES_TYPES: + ext = SUBTITLES_TYPES[type_] + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang + subtitles.setdefault(lang, []).append({ + 'url': src, + 'ext': ext, + }) + return subtitles + + def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + xspf = self._download_xml( + playlist_url, playlist_id, 'Downloading xpsf playlist', + 'Unable to download xspf manifest', fatal=fatal) + if xspf is False: + return [] + return self._parse_xspf(xspf, playlist_id) + + def _parse_xspf(self, playlist, playlist_id): + NS_MAP = { + 'xspf': 'http://xspf.org/ns/0/', + 's1': 'http://static.streamone.nl/player/ns/0', + } + + entries = [] + for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + title = xpath_text( + track, xpath_with_ns('./xspf:title', NS_MAP), 'title', 
default=playlist_id) + description = xpath_text( + track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') + thumbnail = xpath_text( + track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') + duration = float_or_none( + xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) + + formats = [{ + 'url': location.text, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + self._sort_formats(formats) + + entries.append({ + 'id': playlist_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + }) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -964,6 +1283,12 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def _get_cookies(self, url): + """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + req = sanitized_Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies.SimpleCookie(req.get_header('Cookie')) + def get_testcases(self, include_onlymatching=False): t = getattr(self, '_TEST', None) if t: @@ -993,11 +1318,46 @@ class InfoExtractor(object): any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + + @staticmethod + def _merge_subtitle_items(subtitle_list1, 
subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. """ + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): + """ Merge two subtitle dictionaries, language by language. """ + ret = dict(subtitle_dict1) + for lang in subtitle_dict2: + ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + return ret + + def extract_automatic_captions(self, *args, **kwargs): + if (self._downloader.params.get('writeautomaticsub', False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} Instances should define _SEARCH_KEY and _MAX_RESULTS. """ diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 3db4db4e4..6f92ae2ed 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..compat import ( @@ -12,6 +11,7 @@ from ..compat import ( ) from ..utils import ( orderedSet, + remove_end, ) @@ -24,21 +24,33 @@ class CondeNastIE(InfoExtractor): # The keys are the supported sites and the values are the name to be shown # to the user and in the extractor description. 
_SITES = { - 'wired': 'WIRED', + 'allure': 'Allure', + 'architecturaldigest': 'Architectural Digest', + 'arstechnica': 'Ars Technica', + 'bonappetit': 'Bon Appétit', + 'brides': 'Brides', + 'cnevids': 'Condé Nast', + 'cntraveler': 'Condé Nast Traveler', + 'details': 'Details', + 'epicurious': 'Epicurious', + 'glamour': 'Glamour', + 'golfdigest': 'Golf Digest', 'gq': 'GQ', + 'newyorker': 'The New Yorker', + 'self': 'SELF', + 'teenvogue': 'Teen Vogue', + 'vanityfair': 'Vanity Fair', 'vogue': 'Vogue', - 'glamour': 'Glamour', + 'wired': 'WIRED', 'wmagazine': 'W Magazine', - 'vanityfair': 'Vanity Fair', - 'cnevids': 'Condé Nast', } - _VALID_URL = r'http://(video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed)/.+?' % '|'.join(_SITES.keys()) + EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' % '|'.join(_SITES.keys()) - _TEST = { + _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', 'md5': '1921f713ed48aabd715691f774c451f7', 'info_dict': { @@ -47,7 +59,16 @@ class CondeNastIE(InfoExtractor): 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', } - } + }, { + # JS embed + 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', + 'md5': 'f1a6f9cafb7083bab74a710f65d08999', + 'info_dict': { + 'id': '55f9cf8b61646d1acf00000c', + 'ext': 'mp4', + 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + } + }] def _extract_series(self, url, webpage): title = self._html_search_regex(r'
    .*?

    (.+?)

    ', @@ -86,8 +107,8 @@ class CondeNastIE(InfoExtractor): info_url = base_info_url + data info_page = self._download_webpage(info_url, video_id, 'Downloading video info') - video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info') - video_info = json.loads(video_info) + video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') + video_info = self._parse_json(video_info, video_id) formats = [{ 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), @@ -111,6 +132,13 @@ class CondeNastIE(InfoExtractor): url_type = mobj.group('type') item_id = mobj.group('id') + # Convert JS embed to regular embed + if url_type == 'embedjs': + parsed_url = compat_urlparse.urlparse(url) + url = compat_urlparse.urlunparse(parsed_url._replace( + path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) + url_type = 'embed' + self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) webpage = self._download_webpage(url, item_id) diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index cf763ee7e..94d03ce2a 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -11,39 +11,65 @@ from ..utils import ( class CrackedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P\d+)_[\da-z-]+\.html' - _TEST = { + _TESTS = [{ + 'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html', + 'md5': '89b90b9824e3806ca95072c4d78f13f7', + 'info_dict': { + 'id': '19070', + 'ext': 'mp4', + 'title': 'If Animal Actors Got E! 
True Hollywood Stories', + 'timestamp': 1404954000, + 'upload_date': '20140710', + } + }, { + # youtube embed 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', - 'md5': '4b29a5eeec292cd5eca6388c7558db9e', + 'md5': 'ccd52866b50bde63a6ef3b35016ba8c7', 'info_dict': { - 'id': '19006', + 'id': 'EjI00A3rZD0', 'ext': 'mp4', - 'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies', - 'description': 'md5:3b909e752661db86007d10e5ec2df769', - 'timestamp': 1405659600, - 'upload_date': '20140718', + 'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take", + 'description': 'md5:c603708c718b796fe6079e2b3351ffc7', + 'upload_date': '20140725', + 'uploader_id': 'Cracked', + 'uploader': 'Cracked', } - } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + youtube_url = self._search_regex( + r']+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', + webpage, 'youtube url', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + video_url = self._html_search_regex( - [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'([^<]+)'], + webpage, 'title') - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) + description = self._search_regex( + r'name="?(?:og:)?description"?\s+content="([^"]+)"', + webpage, 'description', default=None) - timestamp = self._html_search_regex(r'