From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 12:10:45 +0000 (+0800) Subject: Merge branch 'lecture2go' of https://github.com/nichdu/youtube-dl into nichdu-lecture2go X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=3f724339dbe61fe84dd8e66e9c3b74ba6a9c6ddf;hp=f11554092b419baa919875432fe6ebc1f22f5307 Merge branch 'lecture2go' of https://github.com/nichdu/youtube-dl into nichdu-lecture2go --- diff --git a/AUTHORS b/AUTHORS index db3f42b26..373e05c9f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -123,3 +123,14 @@ Will W. Mohammad Teimori Pabandi Roman Le Négrate Matthias Küch +Julian Richen +Ping O. +Mister Hat +Peter Ding +jackyzy823 +George Brighton +Remita Amine +Aurélio A. Heckert +Bernhard Minks +sceext +Zach Bruggeman diff --git a/README.md b/README.md index caa1478d9..ac54d7b67 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ youtube-dl - download videos from youtube.com or other video platforms - [OPTIONS](#options) - [CONFIGURATION](#configuration) - [OUTPUT TEMPLATE](#output-template) +- [FORMAT SELECTION](#format-selection) - [VIDEO SELECTION](#video-selection) - [FAQ](#faq) - [DEVELOPER INSTRUCTIONS](#developer-instructions) @@ -16,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). 
@@ -51,8 +52,9 @@ which means you can modify it, redistribute it or use it however you like. -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors and the URLs they would handle + --list-extractors List all supported extractors --extractor-descriptions Output descriptions of all supported extractors + --force-generic-extractor Force extraction to use the generic extractor --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching. @@ -73,7 +75,7 @@ which means you can modify it, redistribute it or use it however you like. ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" + --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. --match-title REGEX Download only matching titles (regex or caseless sub-string) @@ -106,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like. 
--playlist-reverse Download playlist videos in reverse order --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental) - --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,wget + --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,httpie,wget --external-downloader-args ARGS Give these arguments to the external downloader ## Filesystem Options: @@ -132,7 +134,7 @@ which means you can modify it, redistribute it or use it however you like. --no-mtime Do not use the Last-modified header to set the file modification time --write-description Write video description to a .description file --write-info-json Write video metadata to a .info.json file - --write-annotations Write video annotations to a .annotation file + --write-annotations Write video annotations to a .annotations.xml file --load-info FILE JSON file containing the video information (created with the "--write-info-json" option) --cookies FILE File to read cookies from and dump cookie jar in --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl @@ -167,7 +169,7 @@ which means you can modify it, redistribute it or use it however you like. 
--no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages to debug problems (very verbose) + --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging @@ -184,22 +186,12 @@ which means you can modify it, redistribute it or use it however you like. --sleep-interval SECONDS Number of seconds to sleep before each download. ## Video Format Options: - -f, --format FORMAT Video format code, specify the order of preference using slashes, as in -f 22/17/18 . Instead of format codes, you can select by - extension for the extensions aac, m4a, mp3, mp4, ogg, wav, webm. You can also use the special names "best", "bestvideo", "bestaudio", - "worst". You can filter the video results by putting a condition in brackets, as in -f "best[height=720]" (or -f "[filesize>10M]"). - This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, - vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a - question mark (?) after the operator. You can combine format filters, so -f "[height <=? 720][tbr>500]" selects up to 720p videos - (or videos where the height is not known) with a bitrate of at least 500 KBit/s. By default, youtube-dl will pick the best quality. - Use commas to download multiple audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio. You can merge the video and audio - of two formats into a single file using -f + (requires ffmpeg or avconv), for example -f - bestvideo+bestaudio. 
+ -f, --format FORMAT Video format code, see the "FORMAT SELECTION" for all the info --all-formats Download all available video formats --prefer-free-formats Prefer free video formats unless a specific one is requested - --max-quality FORMAT Highest quality format to download -F, --list-formats List all available formats - --youtube-skip-dash-manifest Do not download the DASH manifest on YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no + --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no merge is required ## Subtitle Options: @@ -222,17 +214,18 @@ which means you can modify it, redistribute it or use it however you like. --audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) - --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv) + --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the postprocessor -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mp4 videos) + --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata 
to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; + --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors @@ -245,6 +238,26 @@ which means you can modify it, redistribute it or use it however you like. You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`. +### Authentication with `.netrc` file ### + +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in shell command history. You can achieve this using [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis.
For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +``` +touch $HOME/.netrc +chmod a-rwx,u+rw $HOME/.netrc +``` +After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: +``` +machine <extractor> login <login> password <password> +``` +For example: +``` +machine youtube login myaccount@gmail.com password my_youtube_password +machine twitch login my_twitch_account_name password my_twitch_password +``` +To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). + +On Windows you may also need to set up the `%HOME%` environment variable manually. + # OUTPUT TEMPLATE The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are: @@ -271,6 +284,17 @@ $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filena youtube-dl_test_video_.mp4 # A simple file name ``` +# FORMAT SELECTION + +By default youtube-dl tries to download the best quality, but sometimes you may want to download in another format. +The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. + +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`.
You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. + +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e.
you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. + +If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. + # VIDEO SELECTION Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats: @@ -321,9 +345,9 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. -### Do I always have to pass in `--max-quality FORMAT`, or `-citw`? +### Do I always have to pass `-citw`? -By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`. 
+By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. ### Can you please put the -b option back? @@ -355,13 +379,29 @@ YouTube has switched to a new video info format in July 2011 which is not suppor YouTube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. +### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` ### + +That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by shell preventing you from passing the whole URL to youtube-dl. To disable your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell). + +For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with following command: + +```youtube-dl 'https://www.youtube.com/watch?t=4&v=BaW_jenozKc'``` + +or + +```youtube-dl https://www.youtube.com/watch?t=4\&v=BaW_jenozKc``` + +For Windows you have to use the double quotes: + +```youtube-dl "https://www.youtube.com/watch?t=4&v=BaW_jenozKc"``` + ### ExtractorError: Could not find JS function u'OF' In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl. 
### HTTP Error 429: Too Many Requests or 402: Payment Required -These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address. +These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address. ### SyntaxError: Non-ASCII character ### diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c85a39918..73445137f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,12 +10,14 @@ - **56.com** - **5min** - **8tracks** + - **91porn** - **9gag** - **abc.net.au** - **Abc7News** - **AcademicEarth:Course** - **AddAnime** - **AdobeTV** + - **AdobeTVVideo** - **AdultSwim** - **Aftenposten** - **Aftonbladet** @@ -26,8 +28,8 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleConnect** + - **AppleDaily**: 臺灣蘋果日報 - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -44,6 +46,7 @@ - **audiomack** - **audiomack:album** - **Azubu** + - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** - **Bandcamp** @@ -63,6 +66,8 @@ - **BR**: Bayerischer Rundfunk Mediathek - **Break** - **Brightcove** + - **bt:article**: Bergens Tidende Articles + - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** - **Camdemy** @@ -98,14 +103,16 @@ - **CondeNast**: Condé Nast media group: Condé Nast, GQ, Glamour, Vanity Fair, Vogue, W Magazine, WIRED - **Cracked** - **Criterion** + - **CrooksAndLiars** - **Crunchyroll** - **crunchyroll:playlist** - **CSpan**: C-SPAN - 
- **CtsNews** + - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** + - **DailymotionCloud** - **daum.net** - **DBTV** - **DctpTv** @@ -115,7 +122,9 @@ - **Discovery** - **divxstage**: DivxStage - **Dotsub** - - **DouyuTV** + - **DouyuTV**: 斗鱼 + - **dramafever** + - **dramafever:series** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -138,6 +147,7 @@ - **Eporner** - **EroProfile** - **Escapist** + - **ESPN** (Currently broken) - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -147,13 +157,14 @@ - **fc2** - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** + - **FiveTV** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Foxgay** - **FoxNews** + - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** - **FranceInter** @@ -173,6 +184,7 @@ - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites + - **Gfycat** - **GiantBomb** - **Giga** - **Glide**: Glide mobile video messages (glide.me) @@ -180,9 +192,8 @@ - **GodTube** - **GoldenMoustache** - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in and fastvideo.in + - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net - **Goshgay** - - **Grooveshark** - **Groupon** - **Hark** - **HearThisAt** @@ -212,6 +223,7 @@ - **instagram:user**: Instagram user profile - **InternetVideoArchive** - **IPrima** + - **iqiyi**: 爱奇艺 - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -224,6 +236,7 @@ - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -231,12 +244,19 @@ - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** + - **kuwo:album**: 酷我音乐 - 专辑 + - **kuwo:category**: 酷我音乐 - 分类 + - **kuwo:chart**: 酷我音乐 - 排行榜 + - **kuwo:mv**: 酷我音乐 - MV + - **kuwo:singer**: 酷我音乐 
- 歌手 + - **kuwo:song**: 酷我音乐 - **la7.tv** - **Laola1Tv** - - **Letv** + - **Letv**: 乐视网 - **LetvPlaylist** - **LetvTv** - **Libsyn** + - **life:embed** - **lifenews**: LIFE | NEWS - **LiveLeak** - **livestream** @@ -251,6 +271,7 @@ - **Malemotion** - **MDR** - **media.ccc.de** + - **MegaVideoz** - **metacafe** - **Metacritic** - **Mgoon** @@ -269,6 +290,7 @@ - **Motherless** - **Motorsport**: motorsport.com - **MovieClips** + - **MovieFap** - **Moviezine** - **movshare**: MovShare - **MPORA** @@ -282,8 +304,10 @@ - **MySpace** - **MySpace:album** - **MySpass** + - **Myvi** - **myvideo** - **MyVidster** + - **N-JOY** - **n-tv.de** - **NationalGeographic** - **Naver** @@ -296,11 +320,18 @@ - **NDTV** - **NerdCubedFeed** - **Nerdist** + - **netease:album**: 网易云音乐 - 专辑 + - **netease:djradio**: 网易云音乐 - 电台 + - **netease:mv**: 网易云音乐 - MV + - **netease:playlist**: 网易云音乐 - 歌单 + - **netease:program**: 网易云音乐 - 电台节目 + - **netease:singer**: 网易云音乐 - 歌手 + - **netease:song**: 网易云音乐 - **Netzkino** - **Newgrounds** - **Newstube** - - **NextMedia** - - **NextMediaActionNews** + - **NextMedia**: 蘋果日報 + - **NextMediaActionNews**: 蘋果日報 - 動新聞 - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -311,24 +342,30 @@ - **Noco** - **Normalboots** - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - **Nowness** + - **NowTV** - **nowvideo**: NowVideo - - **npo.nl** + - **npo**: npo.nl and ntr.nl + - **npo**: npo.nl and ntr.nl - **npo.nl:live** - **npo.nl:radio** - **npo.nl:radio:fragment** - **NRK** - **NRKPlaylist** - - **NRKTV** + - **NRKTV**: NRK TV and NRK Radio - **ntv.ru** - **Nuvid** - **NYTimes** + - **NYTimesArticle** - **ocw.mit.edu** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** + - **OnionStudios** - **Ooyala** + - **OoyalaExternal** - **OpenFilm** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -337,8 +374,10 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - 
**PBS** + - **PhilharmonieDeParis**: Philharmonie de Paris - **Phoenix** - **Photobucket** + - **Pinkbike** - **Pladform** - **PlanetaPlay** - **play.fm** @@ -359,6 +398,11 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Puls4** - **Pyvideo** + - **qqmusic**: QQ音乐 + - **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:playlist**: QQ音乐 - 歌单 + - **qqmusic:singer**: QQ音乐 - 歌手 + - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuickVid** - **R7** - **radio.de** @@ -367,6 +411,7 @@ - **RadioJavan** - **Rai** - **RBMARadio** + - **RDS**: RDS.ca - **RedTube** - **Restudy** - **ReverbNation** @@ -377,7 +422,6 @@ - **Rte** - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta @@ -390,6 +434,7 @@ - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU + - **Ruutu** - **safari**: safaribooksonline.com online video - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories @@ -401,6 +446,7 @@ - **Screencast** - **ScreencastOMatic** - **ScreenwaveMedia** + - **SenateISVP** - **ServingSys** - **Sexu** - **SexyKarma**: Sexy Karma and Watch Indian Porn @@ -413,9 +459,12 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos + - **SnagFilms** + - **SnagFilmsEmbed** - **Snotr** - - **Sockshare** - **Sohu** + - **soompi** + - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:set** @@ -423,7 +472,10 @@ - **soundgasm** - **soundgasm:profile** - **southpark.cc.com** + - **southpark.cc.com:español** - **southpark.de** + - **southpark.nl** + - **southparkstudios.dk** - **Space** - **SpankBang** - **Spankwire** @@ -433,7 +485,10 @@ - **Spike** - **Sport5** - **SportBox** + - **SportBoxEmbed** - **SportDeutschland** + - **Sportschau** + - **Srf** - **SRMediathek**: Saarländischer Rundfunk - **SSA** - **stanfordoc**: Stanford Open ClassRoom @@ 
-442,6 +497,7 @@ - **StreamCZ** - **StreetVoice** - **SunPorno** + - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv - **SWRMediathek** - **Syfy** @@ -456,8 +512,7 @@ - **TeamFour** - **TechTalks** - **techtv.mit.edu** - - **TED** - - **tegenlicht.vpro.nl** + - **ted** - **TeleBruxelles** - **telecinco.es** - **TeleMB** @@ -468,6 +523,7 @@ - **TheOnion** - **ThePlatform** - **TheSixtyOne** + - **ThisAmericanLife** - **ThisAV** - **THVideo** - **THVideoPlaylist** @@ -475,6 +531,7 @@ - **tlc.com** - **tlc.de** - **TMZ** + - **TMZArticle** - **TNAFlix** - **tou.tv** - **Toypics**: Toypics user profile @@ -483,13 +540,18 @@ - **Trilulilu** - **TruTube** - **Tube8** + - **TubiTv** - **Tudou** - **Tumblr** - **TuneIn** - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV2** + - **TV2Article** - **TV4**: tv4.se and tv4play.se + - **TVC** + - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** @@ -502,10 +564,11 @@ - **twitch:stream** - **twitch:video** - **twitch:vod** + - **TwitterCard** - **Ubu** - **udemy** - **udemy:course** - - **UDNEmbed** + - **UDNEmbed**: 聯合影音 - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt @@ -518,7 +581,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - - **VGTV** + - **VGTV**: VGTV and BTTV - **vh1.com** - **Vice** - **Viddler** @@ -538,6 +601,7 @@ - **vier:videos** - **Viewster** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** @@ -546,12 +610,13 @@ - **vimeo:review**: Review pages on vimeo - **vimeo:user** - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) - - **Vimple**: Vimple.ru + - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** - - **vk.com** - - **vk.com:user-videos**: vk.com:All of a user's videos + - **vk**: VK + - **vk:uservideos**: VK - User's Videos - **Vodlocker** + - **VoiceRepublic** - **Vporn** - **VRT** - **vube**: Vube.com @@ -565,31 +630,36 @@ - **wdr:mobile** - **WDRMaus**: 
Sendung mit der Maus - **WebOfStories** + - **WebOfStoriesPlaylist** - **Weibo** - **Wimp** - **Wistia** + - **WNL** - **WorldStarHipHop** - **wrzuta.pl** - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** - **XHamster** + - **XHamsterEmbed** - **XMinus** - **XNXX** + - **Xstream** - **XTube** - **XTubeUser**: XTube user profile - - **Xuite** + - **Xuite**: 隨意窩Xuite影音 - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - - **Yam** + - **Yam**: 蕃薯藤yam天空部落 - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YesJapan** + - **yinyuetai:video**: 音悦Tai - **Ynet** - **YouJizz** - - **Youku** + - **youku**: 优酷 - **YouPorn** - **YourUpload** - **youtube**: YouTube.com diff --git a/test/helper.py b/test/helper.py index 12afdf184..e1129e58f 100644 --- a/test/helper.py +++ b/test/helper.py @@ -150,7 +150,7 @@ def expect_info_dict(self, got_dict, expected_dict): 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) # Check for the presence of mandatory fields - if got_dict.get('_type') != 'playlist': + if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for mandatory fields that are automatically set by YoutubeDL diff --git a/test/parameters.json b/test/parameters.json index cbff9bd16..7bf59c25f 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -7,8 +7,7 @@ "forcethumbnail": false, "forcetitle": false, "forceurl": false, - "format": null, - "format_limit": null, + "format": "best", "ignoreerrors": false, "listformats": null, "logtostderr": false, diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 652519831..a13c09ef4 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -12,6 +12,7 @@ import copy from test.helper import FakeYDL, assertRegexpMatches from 
youtube_dl import YoutubeDL +from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor from youtube_dl.utils import match_filter_func @@ -101,39 +102,6 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'flv') - def test_format_limit(self): - formats = [ - {'format_id': 'meh', 'url': 'http://example.com/meh', 'preference': 1}, - {'format_id': 'good', 'url': 'http://example.com/good', 'preference': 2}, - {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, - {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, - ] - info_dict = _make_result(formats) - - ydl = YDL() - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - - ydl = YDL({'format_limit': 'good'}) - assert ydl.params['format_limit'] == 'good' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'good') - - ydl = YDL({'format_limit': 'great', 'format': 'all'}) - ydl.process_ie_result(info_dict.copy()) - self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'meh') - self.assertEqual(ydl.downloaded_info_dicts[1]['format_id'], 'good') - self.assertEqual(ydl.downloaded_info_dicts[2]['format_id'], 'great') - self.assertTrue('3' in ydl.msgs[0]) - - ydl = YDL() - ydl.params['format_limit'] = 'excellent' - ydl.process_ie_result(info_dict.copy()) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], 'excellent') - def test_format_selection(self): formats = [ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, @@ -270,7 +238,7 @@ class TestFormatSelection(unittest.TestCase): f2['url'] = 'url:' + f2id info_dict = _make_result([f1, f2], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 
'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) @@ -278,7 +246,7 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], f1id) info_dict = _make_result([f2, f1], extractor='youtube') - ydl = YDL() + ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) @@ -443,27 +411,36 @@ class TestYoutubeDL(unittest.TestCase): def run(self, info): with open(audiofile, 'wt') as f: f.write('EXAMPLE') - info['filepath'] - return False, info + return [info['filepath']], info - def run_pp(params): + def run_pp(params, PP): with open(filename, 'wt') as f: f.write('EXAMPLE') ydl = YoutubeDL(params) - ydl.add_post_processor(SimplePP()) + ydl.add_post_processor(PP()) ydl.post_process(filename, {'filepath': filename}) - run_pp({'keepvideo': True}) + run_pp({'keepvideo': True}, SimplePP) self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(filename) os.unlink(audiofile) - run_pp({'keepvideo': False}) + run_pp({'keepvideo': False}, SimplePP) self.assertFalse(os.path.exists(filename), '%s exists' % filename) self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile) os.unlink(audiofile) + class ModifierPP(PostProcessor): + def run(self, info): + with open(info['filepath'], 'wt') as f: + f.write('MODIFIED') + return [], info + + run_pp({'keepvideo': False}, ModifierPP) + self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename) + os.unlink(filename) + def test_match_filter(self): class FilterYDL(YDL): def __init__(self, *args, **kwargs): @@ -531,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase): res = get_videos(f) self.assertEqual(res, ['1']) + def test_playlist_items_selection(self): + entries = [{ + 'id': compat_str(i), + 'title': compat_str(i), + 'url': TEST_URL, + } for i 
in range(1, 5)] + playlist = { + '_type': 'playlist', + 'id': 'test', + 'entries': entries, + 'extractor': 'test:playlist', + 'extractor_key': 'test:playlist', + 'webpage_url': 'http://example.com', + } + + def get_ids(params): + ydl = YDL(params) + # make a copy because the dictionary can be modified + ydl.process_ie_result(playlist.copy()) + return [int(v['id']) for v in ydl.downloaded_info_dicts] + + result = get_ids({}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 10}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 2}) + self.assertEqual(result, [1, 2]) + + result = get_ids({'playliststart': 10}) + self.assertEqual(result, []) + + result = get_ids({'playliststart': 2}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2-4'}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2,4'}) + self.assertEqual(result, [2, 4]) + + result = get_ids({'playlist_items': '10'}) + self.assertEqual(result, []) + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py index 4dc7de7b5..315a3f5ae 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -39,7 +39,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) @@ -47,7 +47,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) diff --git a/test/test_compat.py b/test/test_compat.py index 1eb454e06..c3ba8ad2e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -14,6 +14,8 @@ 
from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_expanduser, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, ) @@ -42,5 +44,28 @@ class TestCompat(unittest.TestCase): dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_unquote(self): + self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') + self.assertEqual(compat_urllib_parse_unquote(''), '') + self.assertEqual(compat_urllib_parse_unquote('%'), '%') + self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') + self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') + self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') + self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') + self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') + self.assertEqual( + compat_urllib_parse_unquote(''' +%%a'''), + ''' +%%a''') + self.assertEqual( + compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), + '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') + + def test_compat_urllib_parse_unquote_plus(self): + self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index 6a149ae4f..1110357a7 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -153,7 +153,7 @@ def generator(test_case): break if is_playlist: - self.assertEqual(res_dict['_type'], 'playlist') + self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video']) self.assertTrue('entries' in res_dict) expect_info_dict(self, res_dict, 
test_case.get('info_dict', {})) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): diff --git a/test/test_utils.py b/test/test_utils.py index 2e3a6480c..e13e11b59 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -40,7 +40,8 @@ from youtube_dl.utils import ( read_batch_urls, sanitize_filename, sanitize_path, - sanitize_url_path_consecutive_slashes, + prepend_extension, + replace_extension, shell_quote, smuggle_url, str_to_int, @@ -51,6 +52,7 @@ from youtube_dl.utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + lowercase_escape, url_basename, urlencode_postdata, version_tuple, @@ -58,6 +60,8 @@ from youtube_dl.utils import ( xpath_text, render_table, match_str, + parse_dfxp_time_expr, + dfxp2srt, ) @@ -171,25 +175,21 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') - def test_sanitize_url_path_consecutive_slashes(self): - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname//'), - 'http://hostname/') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - 
sanitize_url_path_consecutive_slashes('http://hostname/'), - 'http://hostname/') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/abc//'), - 'http://hostname/abc/') + def test_prepend_extension(self): + self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') + self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') + self.assertEqual(prepend_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') + self.assertEqual(prepend_extension('abc', 'temp'), 'abc.temp') + self.assertEqual(prepend_extension('.abc', 'temp'), '.abc.temp') + self.assertEqual(prepend_extension('.abc.ext', 'temp'), '.abc.temp.ext') + + def test_replace_extension(self): + self.assertEqual(replace_extension('abc.ext', 'temp'), 'abc.temp') + self.assertEqual(replace_extension('abc.ext', 'temp', 'ext'), 'abc.temp') + self.assertEqual(replace_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp') + self.assertEqual(replace_extension('abc', 'temp'), 'abc.temp') + self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') + self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') def test_ordered_set(self): self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7]) @@ -398,6 +398,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') + def test_lowercase_escape(self): + self.assertEqual(lowercase_escape('aä'), 'aä') + self.assertEqual(lowercase_escape('\\u0026'), '&') + def test_limit_length(self): self.assertEqual(limit_length(None, 12), None) self.assertEqual(limit_length('foo', 12), 'foo') @@ -581,6 +585,57 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') 'like_count > 100 & dislike_count + + +
+

The following line contains Chinese characters and special symbols

+

第二行
♪♪

+

Third
Line

+
+ +
''' + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The following line contains Chinese characters and special symbols + +2 +00:00:01,000 --> 00:00:02,000 +第二行 +♪♪ + +3 +00:00:02,000 --> 00:00:03,000 +Third +Line + +''' + self.assertEqual(dfxp2srt(dfxp_data), srt_data) + + dfxp_data_no_default_namespace = ''' + + +
+

The first line

+
+ +
''' + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The first line + +''' + self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) + if __name__ == '__main__': unittest.main() diff --git a/tox.ini b/tox.ini index 00c6e00e3..cd805fe8a 100644 --- a/tox.ini +++ b/tox.ini @@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34 deps = nose coverage +# We need a valid $HOME for test_compat_expanduser +passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a68b24ab4..00af78e06 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -64,7 +65,6 @@ from .utils import ( sanitize_path, std_headers, subtitles_filename, - takewhile_inclusive, UnavailableVideoError, url_basename, version_tuple, @@ -72,6 +72,7 @@ from .utils import ( write_string, YoutubeDLHandler, prepend_extension, + replace_extension, args_to_str, age_restricted, ) @@ -118,7 +119,7 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. - videopassword: Password for acces a video. + videopassword: Password for accessing a video. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. @@ -135,10 +136,10 @@ class YoutubeDL(object): (or video) as a single JSON line. simulate: Do not download the video files. format: Video format code. See options.py for more information. - format_limit: Highest quality format to try. outtmpl: Template for output names. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. 
+ force_generic_extractor: Force downloader to use the generic extractor nooverwrites: Prevent overwriting files. playliststart: Playlist item to start at. playlistend: Playlist item to end at. @@ -261,7 +262,8 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. - exec_cmd: Arbitrary command to run after downloading + postprocessor_args: A list of additional command-line arguments for the + postprocessor. """ params = None @@ -627,13 +629,16 @@ class YoutubeDL(object): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True): + process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. extra_info is a dict containing the extra values to add to each result ''' + if not ie_key and force_generic_extractor: + ie_key = 'Generic' + if ie_key: ies = [self.get_info_extractor(ie_key)] else: @@ -761,7 +766,9 @@ class YoutubeDL(object): if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: - entries = [ie_entries[i - 1] for i in playlistitems] + entries = [ + ie_entries[i - 1] for i in playlistitems + if -n_all_entries <= i - 1 < n_all_entries] else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) @@ -916,15 +923,17 @@ class YoutubeDL(object): if not available_formats: return None - if format_spec == 'best' or format_spec is None: - return available_formats[-1] - elif format_spec == 'worst': + if format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 audiovideo_formats = [ f for f in available_formats if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: - return audiovideo_formats[0] - return available_formats[0] + return audiovideo_formats[format_idx] + # for audio only 
(soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in available_formats) or + all(f.get('vcodec') != 'none' for f in available_formats)): + return available_formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ f for f in available_formats @@ -1001,7 +1010,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): - if 'width' in t and 'height' in t: + if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i @@ -1013,13 +1022,13 @@ class YoutubeDL(object): info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around negative timestamps in Windows - # (see http://bugs.python.org/issue1646728) - if info_dict['timestamp'] < 0 and os.name == 'nt': - info_dict['timestamp'] = 0 - upload_date = datetime.datetime.utcfromtimestamp( - info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: @@ -1030,12 +1039,6 @@ class YoutubeDL(object): info_dict['id'], info_dict.get('subtitles'), info_dict.get('automatic_captions')) - # This extractors handle format selection themselves - if info_dict['extractor'] in ['Youku']: - if download: - self.process_info(info_dict) - return info_dict - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available @@ -1046,6 +1049,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1053,6 +1058,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], @@ -1068,12 +1085,6 @@ class YoutubeDL(object): full_format_info.update(format) format['http_headers'] = self._calc_headers(full_format_info) - format_limit = self.params.get('format_limit', None) - if format_limit: - formats = list(takewhile_inclusive( - lambda f: f['format_id'] != format_limit, formats - )) - # TODO 
Central sorting goes here if formats[0] is not info_dict: @@ -1091,7 +1102,14 @@ class YoutubeDL(object): req_format = self.params.get('format') if req_format is None: - req_format = 'best' + req_format_list = [] + if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and + info_dict['extractor'] in ['youtube', 'ted']): + merger = FFmpegMergerPP(self) + if merger.available and merger.can_merge(): + req_format_list.append('bestvideo+bestaudio') + req_format_list.append('best') + req_format = '/'.join(req_format_list) formats_to_download = [] if req_format == 'all': formats_to_download = formats @@ -1273,7 +1291,7 @@ class YoutubeDL(object): return if self.params.get('writedescription', False): - descfn = filename + '.description' + descfn = replace_extension(filename, 'description', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): self.to_screen('[info] Video description is already present') elif info_dict.get('description') is None: @@ -1288,7 +1306,7 @@ class YoutubeDL(object): return if self.params.get('writeannotations', False): - annofn = filename + '.annotations.xml' + annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): self.to_screen('[info] Video annotations are already present') else: @@ -1335,13 +1353,13 @@ class YoutubeDL(object): return if self.params.get('writeinfojson', False): - infofn = os.path.splitext(filename)[0] + '.info.json' + infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): self.to_screen('[info] Video description metadata is already present') else: self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) try: - write_json_file(info_dict, infofn) + write_json_file(self.filter_requested_info(info_dict), infofn) except (OSError, IOError): 
self.report_error('Cannot write metadata to JSON file ' + infofn) return @@ -1361,24 +1379,57 @@ class YoutubeDL(object): if info_dict.get('requested_formats') is not None: downloaded = [] success = True - merger = FFmpegMergerPP(self, not self.params.get('keepvideo')) + merger = FFmpegMergerPP(self) if not merger.available: postprocessors = [] self.report_warning('You have requested multiple ' 'formats but ffmpeg or avconv are not installed.' - ' The formats won\'t be merged') + ' The formats won\'t be merged.') else: postprocessors = [merger] - for f in info_dict['requested_formats']: - new_info = dict(info_dict) - new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id']) - downloaded.append(fname) - partial_success = dl(fname, new_info) - success = success and partial_success - info_dict['__postprocessors'] = postprocessors - info_dict['__files_to_merge'] = downloaded + + def compatible_formats(formats): + video, audio = formats + # Check extension + video_ext, audio_ext = audio.get('ext'), video.get('ext') + if video_ext and audio_ext: + COMPATIBLE_EXTS = ( + ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'), + ('webm') + ) + for exts in COMPATIBLE_EXTS: + if video_ext in exts and audio_ext in exts: + return True + # TODO: Check acodec/vcodec + return False + + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext == info_dict['ext'] + else filename) + requested_formats = info_dict['requested_formats'] + if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): + info_dict['ext'] = 'mkv' + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into mkv.') + # Ensure filename always has a correct extension for successful merge + filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) + if os.path.exists(encodeFilename(filename)): + self.to_screen( + 
'[download] %s has already been downloaded and ' + 'merged' % filename) + else: + for f in requested_formats: + new_info = dict(info_dict) + new_info.update(f) + fname = self.prepare_filename(new_info) + fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + downloaded.append(fname) + partial_success = dl(fname, new_info) + success = success and partial_success + info_dict['__postprocessors'] = postprocessors + info_dict['__files_to_merge'] = downloaded else: # Just a single file success = dl(filename, info_dict) @@ -1448,7 +1499,8 @@ class YoutubeDL(object): for url in url_list: try: # It also downloads the videos - res = self.extract_info(url) + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: @@ -1465,7 +1517,7 @@ class YoutubeDL(object): [info_filename], mode='r', openhook=fileinput.hook_encoded('utf-8'))) as f: # FileInput doesn't have a read method, we can't call json.load - info = json.loads('\n'.join(f)) + info = self.filter_requested_info(json.loads('\n'.join(f))) try: self.process_ie_result(info, download=True) except DownloadError: @@ -1477,6 +1529,12 @@ class YoutubeDL(object): raise return self._download_retcode + @staticmethod + def filter_requested_info(info_dict): + return dict( + (k, v) for k, v in info_dict.items() + if k not in ['requested_formats', 'requested_subtitles']) + def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" info = dict(ie_info) @@ -1486,24 +1544,18 @@ class YoutubeDL(object): pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: - keep_video = None - old_filename = info['filepath'] + files_to_delete = [] try: - keep_video_wish, info = pp.run(info) - if keep_video_wish is not None: - if keep_video_wish: - keep_video = keep_video_wish - elif keep_video is None: - # No 
clear decision yet, let IE decide - keep_video = keep_video_wish + files_to_delete, info = pp.run(info) except PostProcessingError as e: self.report_error(e.msg) - if keep_video is False and not self.params.get('keepvideo', False): - try: + if files_to_delete and not self.params.get('keepvideo', False): + for old_filename in files_to_delete: self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded video file') + try: + os.remove(encodeFilename(old_filename)) + except (IOError, OSError): + self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): # Future-proof against any change in case @@ -1671,7 +1723,8 @@ class YoutubeDL(object): if req_is_string: req = url_escaped else: - req = compat_urllib_request.Request( + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req = req_type( url_escaped, data=req.data, headers=req.headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) @@ -1817,7 +1870,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' 
+ thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1c8b411b7..55b22c889 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -169,7 +169,7 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error('invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: @@ -240,15 +240,18 @@ def _real_main(argv=None): if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) if opts.embedthumbnail: - if not opts.addmetadata: - postprocessors.append({'key': 'FFmpegAudioFix'}) - postprocessors.append({'key': 'AtomicParsley'}) + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
if opts.exec_cmd: postprocessors.append({ 'key': 'ExecAfterDownload', - 'verboseOutput': opts.verbose, 'exec_cmd': opts.exec_cmd, }) if opts.xattr_set_filesize: @@ -260,6 +263,9 @@ def _real_main(argv=None): external_downloader_args = None if opts.external_downloader_args: external_downloader_args = shlex.split(opts.external_downloader_args) + postprocessor_args = None + if opts.postprocessor_args: + postprocessor_args = shlex.split(opts.postprocessor_args) match_filter = ( None if opts.match_filter is None else match_filter_func(opts.match_filter)) @@ -285,12 +291,12 @@ def _real_main(argv=None): 'simulate': opts.simulate or any_getting, 'skip_download': opts.skip_download, 'format': opts.format, - 'format_limit': opts.format_limit, 'listformats': opts.listformats, 'outtmpl': outtmpl, 'autonumber_size': opts.autonumber_size, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, 'retries': opts_retries, @@ -348,7 +354,6 @@ def _real_main(argv=None): 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, - 'exec_cmd': opts.exec_cmd, 'extract_flat': opts.extract_flat, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, @@ -365,6 +370,7 @@ def _real_main(argv=None): 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'external_downloader_args': external_downloader_args, + 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, } diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) + data = 
bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 973bcd320..0c57c7aeb 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -9,6 +9,7 @@ import shutil import socket import subprocess import sys +import itertools try: @@ -46,11 +47,6 @@ try: except ImportError: # Python 2 import htmlentitydefs as compat_html_entities -try: - import html.parser as compat_html_parser -except ImportError: # Python 2 - import HTMLParser as compat_html_parser - try: import http.client as compat_http_client except ImportError: # Python 2 @@ -79,42 +75,74 @@ except ImportError: import BaseHTTPServer as compat_http_server try: + from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': + from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus +except ImportError: # Python 2 + _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') + else re.compile('([\x00-\x7f]+)')) + + # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus + # implementations from cpython 3.4.3's stdlib. Python 2's version + # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) + + def compat_urllib_parse_unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? 
+ string.split + return b'' + if isinstance(string, unicode): + string = string.encode('utf-8') + bits = string.split(b'%') + if len(bits) == 1: return string - res = string.split('%') - if len(res) == 1: + res = [bits[0]] + append = res.append + for item in bits[1:]: + try: + append(compat_urllib_parse._hextochr[item[:2]]) + append(item[2:]) + except KeyError: + append(b'%') + append(item) + return b''.join(res) + + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + if '%' not in string: + string.split return string if encoding is None: encoding = 'utf-8' if errors is None: errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. 
- string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string + bits = _asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) + return ''.join(res) + + def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return compat_urllib_parse_unquote(string, encoding, errors) try: compat_str = unicode # Python 2 @@ -393,6 +421,15 @@ else: pass return _terminal_size(columns, lines) +try: + itertools.count(start=0, step=1) + compat_itertools_count = itertools.count +except TypeError: # Python 2.6 + def compat_itertools_count(start=0, step=1): + n = start + while True: + yield n + n += step __all__ = [ 'compat_HTTPError', @@ -404,9 +441,9 @@ __all__ = [ 'compat_getenv', 'compat_getpass', 'compat_html_entities', - 'compat_html_parser', 'compat_http_client', 'compat_http_server', + 'compat_itertools_count', 'compat_kwargs', 'compat_ord', 'compat_parse_qs', @@ -417,6 +454,8 @@ __all__ = [ 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urlparse', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 9fb66e2f7..dccc59212 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -6,8 +6,9 @@ from .f4m import F4mFD from .hls import HlsFD from .hls import NativeHlsFD from .http import HttpFD -from .mplayer import MplayerFD +from .rtsp import RtspFD from 
.rtmp import RtmpFD +from .dash import DashSegmentsFD from ..utils import ( determine_protocol, @@ -17,9 +18,10 @@ PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': NativeHlsFD, 'm3u8': HlsFD, - 'mms': MplayerFD, - 'rtsp': MplayerFD, + 'mms': RtspFD, + 'rtsp': RtspFD, 'f4m': F4mFD, + 'http_dash_segments': DashSegmentsFD, } diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index a0fc5ead0..97e755d4b 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -8,6 +8,7 @@ import time from ..compat import compat_str from ..utils import ( encodeFilename, + decodeArgument, format_bytes, timeconvert, ) @@ -353,19 +354,15 @@ class FileDownloader(object): # this interface self._progress_hooks.append(ph) - def _debug_cmd(self, args, subprocess_encoding, exe=None): + def _debug_cmd(self, args, exe=None): if not self.params.get('verbose', False): return + str_args = [decodeArgument(a) for a in args] + if exe is None: - exe = os.path.basename(args[0]) + exe = os.path.basename(str_args[0]) - if subprocess_encoding: - str_args = [ - a.decode(subprocess_encoding) if isinstance(a, bytes) else a - for a in args] - else: - str_args = args try: import pipes shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 000000000..a4685d307 --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import FileDownloader +from ..compat import compat_urllib_request + + +class DashSegmentsFD(FileDownloader): + """ + Download segments in a DASH manifest + """ + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + base_url = info_dict['url'] + segment_urls = info_dict['segment_urls'] + + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None 
+ byte_counter = 0 + + def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): + self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) + req = compat_urllib_request.Request(target_url) + if remaining_bytes is not None: + req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + + data = self.ydl.urlopen(req).read() + + if remaining_bytes is not None: + data = data[:remaining_bytes] + + outf.write(data) + return len(data) + + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s/%s' % (base_url, target_url) + + with open(tmpfilename, 'wb') as outf: + append_url_to_file( + outf, combine_url(base_url, info_dict['initialization_url']), + 'initialization segment') + for i, segment_url in enumerate(segment_urls): + segment_len = append_url_to_file( + outf, combine_url(base_url, segment_url), + 'segment %d / %d' % (i + 1, len(segment_urls)), + remaining_bytes) + byte_counter += segment_len + if remaining_bytes is not None: + remaining_bytes -= segment_len + if remaining_bytes <= 0: + break + + self.try_rename(tmpfilename, filename) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 1673b2382..1d5cc9904 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals import os.path import subprocess -import sys from .common import FileDownloader from ..utils import ( encodeFilename, + encodeArgument, ) @@ -60,17 +60,9 @@ class ExternalFD(FileDownloader): def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ - cmd = self._make_cmd(tmpfilename, info_dict) - - if sys.platform == 'win32' and sys.version_info < (3, 0): - # Windows 
subprocess module does not actually support Unicode - # on Python 2.x - # See http://stackoverflow.com/a/9951851/35070 - subprocess_encoding = sys.getfilesystemencoding() - cmd = [a.encode(subprocess_encoding, 'ignore') for a in cmd] - else: - subprocess_encoding = None - self._debug_cmd(cmd, subprocess_encoding) + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) p = subprocess.Popen( cmd, stderr=subprocess.PIPE) @@ -117,6 +109,14 @@ class Aria2cFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd + +class HttpieFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] + return cmd + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() @@ -131,5 +131,6 @@ def list_external_downloaders(): def get_external_downloader(external_downloader): """ Given the name of the executable, see whether we support the given downloader . """ - bn = os.path.basename(external_downloader) + # Drop .exe extension on Windows + bn = os.path.splitext(os.path.basename(external_downloader))[0] return _BY_NAME[bn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 4ab000d67..b1a858c45 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -389,6 +389,8 @@ class F4mFD(FileDownloader): url = base_url + name if akamai_pv: url += '?' 
+ akamai_pv.strip(';') + if info_dict.get('extra_param_to_segment_url'): + url += info_dict.get('extra_param_to_segment_url') frag_filename = '%s-%s' % (tmpfilename, name) try: success = http_dl.download(frag_filename, {'url': url}) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index d136bebd1..b7f144af9 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -28,13 +28,8 @@ class HttpFD(FileDownloader): add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - data = info_dict.get('http_post_data') - http_method = info_dict.get('http_method') - basic_request = compat_urllib_request.Request(url, data, headers) - request = compat_urllib_request.Request(url, data, headers) - if http_method is not None: - basic_request.get_method = lambda: http_method - request.get_method = lambda: http_method + basic_request = compat_urllib_request.Request(url, None, headers) + request = compat_urllib_request.Request(url, None, headers) is_test = self.params.get('test', False) diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py deleted file mode 100644 index 72cef30ea..000000000 --- a/youtube_dl/downloader/mplayer.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -import os -import subprocess - -from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) - - -class MplayerFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - args = [ - 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', - '-dumpstream', '-dumpfile', tmpfilename, url] - # Check for mplayer first - if not check_executable('mplayer', ['-h']): - self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0]) - return False - - # Download using mplayer. 
- retval = subprocess.call(args) - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('mplayer exited with code %d' % retval) - return False diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index ddf5724ae..7d19bb808 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import os import re import subprocess -import sys import time from .common import FileDownloader @@ -11,6 +10,7 @@ from ..compat import compat_str from ..utils import ( check_executable, encodeFilename, + encodeArgument, get_exe_version, ) @@ -121,7 +121,7 @@ class RtmpFD(FileDownloader): # possible. This is part of rtmpdump's normal usage, AFAIK. 
basic_args = [ 'rtmpdump', '--verbose', '-r', url, - '-o', encodeFilename(tmpfilename, True)] + '-o', tmpfilename] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: @@ -131,7 +131,7 @@ class RtmpFD(FileDownloader): if play_path is not None: basic_args += ['--playpath', play_path] if tc_url is not None: - basic_args += ['--tcUrl', url] + basic_args += ['--tcUrl', tc_url] if test: basic_args += ['--stop', '1'] if flash_version is not None: @@ -154,16 +154,9 @@ class RtmpFD(FileDownloader): if not live and continue_dl: args += ['--skip', '1'] - if sys.platform == 'win32' and sys.version_info < (3, 0): - # Windows subprocess module does not actually support Unicode - # on Python 2.x - # See http://stackoverflow.com/a/9951851/35070 - subprocess_encoding = sys.getfilesystemencoding() - args = [a.encode(subprocess_encoding, 'ignore') for a in args] - else: - subprocess_encoding = None + args = [encodeArgument(a) for a in args] - self._debug_cmd(args, subprocess_encoding, exe='rtmpdump') + self._debug_cmd(args, exe='rtmpdump') RD_SUCCESS = 0 RD_FAILED = 1 @@ -180,7 +173,11 @@ class RtmpFD(FileDownloader): prevsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen('[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED]) + args = basic_args + ['--resume'] + if retval == RD_FAILED: + args += ['--skip', '1'] + args = [encodeArgument(a) for a in args] + retval = run_rtmpdump(args) cursize = os.path.getsize(encodeFilename(tmpfilename)) if prevsize == cursize and retval == RD_FAILED: break diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py new file mode 100644 index 000000000..3eb29526c --- /dev/null +++ b/youtube_dl/downloader/rtsp.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +import os +import subprocess + +from .common import FileDownloader +from ..utils import ( + 
check_executable, + encodeFilename, +) + + +class RtspFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + if check_executable('mplayer', ['-h']): + args = [ + 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', + '-dumpstream', '-dumpfile', tmpfilename, url] + elif check_executable('mpv', ['-h']): + args = [ + 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] + else: + self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') + return False + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + return True + else: + self.to_stderr('\n') + self.report_error('%s exited with code %d' % (args[0], retval)) + return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3d6e981b2..3cfa804ec 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -4,7 +4,10 @@ from .abc import ABCIE from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE -from .adobetv import AdobeTVIE +from .adobetv import ( + AdobeTVIE, + AdobeTVVideoIE, +) from .adultswim import AdultSwimIE from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE @@ -16,9 +19,14 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE +from .appleconnect import AppleConnectIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE -from .ard import ARDIE, ARDMediathekIE +from .ard import ( + ARDIE, + 
ARDMediathekIE, + SportschauIE, +) from .arte import ( ArteTvIE, ArteTVPlus7IE, @@ -32,6 +40,7 @@ from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE +from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE @@ -70,6 +79,7 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE +from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE @@ -101,6 +111,7 @@ from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, DailymotionUserIE, + DailymotionCloudIE, ) from .daum import DaumIE from .dbtv import DBTVIE @@ -110,6 +121,10 @@ from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE @@ -134,11 +149,11 @@ from .ellentv import ( ) from .elpais import ElPaisIE from .embedly import EmbedlyIE -from .empflix import EMPFlixIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE +from .espn import ESPNIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@ -146,10 +161,10 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE +from .fivetv import FiveTVIE from .fktv import ( FKTVIE, FKTVPosteckeIE, @@ -160,6 +175,7 @@ from .footyroom import FootyRoomIE from .fourtube import FourTubeIE from .foxgay 
import FoxgayIE from .foxnews import FoxNewsIE +from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( @@ -185,6 +201,7 @@ from .gametrailers import GametrailersIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE @@ -196,7 +213,6 @@ from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE -from .grooveshark import GroovesharkIE from .groupon import GrouponIE from .hark import HarkIE from .hearthisat import HearThisAtIE @@ -226,6 +242,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .iqiyi import IqiyiIE from .ivi import ( IviIE, IviCompilationIE @@ -240,6 +257,7 @@ from .kaltura import KalturaIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE @@ -247,6 +265,14 @@ from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) from .la7 import LA7IE from .laola1tv import Laola1TvIE from .lecture2go import Lecture2GoIE @@ -256,7 +282,10 @@ from .letv import ( LetvPlaylistIE ) from .libsyn import LibsynIE -from .lifenews import LifeNewsIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, @@ -274,6 +303,7 @@ from .macgamestore import MacGameStoreIE from .mailru import 
MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE +from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@ -307,6 +337,7 @@ from .musicvault import MusicVaultIE from .muzu import MuzuTVIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE +from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE @@ -318,18 +349,29 @@ from .nbc import ( NBCSportsIE, NBCSportsVPlayerIE, ) -from .ndr import NDRIE +from .ndr import ( + NDRIE, + NJoyIE, +) from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE from .nerdist import NerdistIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE @@ -343,15 +385,18 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, NPOLiveIE, NPORadioIE, NPORadioFragmentIE, - TegenlichtVproIE, + VPROIE, + WNLIE ) from .nrk import ( NRKIE, @@ -360,11 +405,18 @@ from .nrk import ( ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE -from .nytimes import NYTimesIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, +) from .nuvid import NuvidIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE -from .ooyala import OoyalaIE +from 
.onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, @@ -375,8 +427,10 @@ from .orf import ( from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE @@ -384,6 +438,7 @@ from .playfm import PlayFMIE from .playvid import PlayvidIE from .playwire import PlaywireIE from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -397,6 +452,13 @@ from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .puls4 import Puls4IE from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) from .quickvid import QuickVidIE from .r7 import R7IE from .radiode import RadioDeIE @@ -405,6 +467,7 @@ from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE +from .rds import RDSIE from .redtube import RedTubeIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE @@ -415,7 +478,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE @@ -429,6 +491,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, @@ -440,7 +503,8 @@ from .sbs import SBSIE from .scivee import SciVeeIE from .screencast import ScreencastIE from .screencastomatic import 
ScreencastOMaticIE -from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE +from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE from .sexykarma import SexyKarmaIE @@ -455,9 +519,16 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -470,7 +541,10 @@ from .soundgasm import ( ) from .southpark import ( SouthParkIE, - SouthparkDeIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkNlIE ) from .space import SpaceIE from .spankbang import SpankBangIE @@ -479,8 +553,12 @@ from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .sport5 import Sport5IE -from .sportbox import SportBoxIE +from .sportbox import ( + SportBoxIE, + SportBoxEmbedIE, +) from .sportdeutschland import SportDeutschlandIE +from .srf import SrfIE from .srmediathek import SRMediathekIE from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE @@ -489,7 +567,10 @@ from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE from .sunporno import SunPornoIE -from .svtplay import SVTPlayIE +from .svt import ( + SVTIE, + SVTPlayIE, +) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE @@ -515,11 +596,19 @@ from .tf1 import TF1IE from .theonion import TheOnionIE from .theplatform import ThePlatformIE from .thesixtyone import TheSixtyOneIE +from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE -from .tmz import TMZIE -from .tnaflix import TNAFlixIE 
+from .tmz import ( + TMZIE, + TMZArticleIE, +) +from .tnaflix import ( + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) from .thvideo import ( THVideoIE, THVideoPlaylistIE @@ -530,12 +619,21 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, +) from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE @@ -554,6 +652,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) +from .twitter import TwitterCardIE from .ubu import UbuIE from .udemy import ( UdemyIE, @@ -571,7 +670,11 @@ from .veoh import VeohIE from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE -from .vgtv import VGTVIE +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE @@ -602,12 +705,16 @@ from .vine import ( VineIE, VineUserIE, ) -from .viki import VikiIE +from .viki import ( + VikiIE, + VikiChannelIE, +) from .vk import ( VKIE, VKUserVideosIE, ) from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE @@ -622,7 +729,10 @@ from .wdr import ( WDRMobileIE, WDRMausIE, ) -from .webofstories import WebOfStoriesIE +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE @@ -631,12 +741,16 @@ from .wrzuta import WrzutaIE from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE -from .xhamster import XHamsterIE +from .xhamster import ( + XHamsterIE, + 
XHamsterEmbedIE, +) from .xminus import XMinusIE from .xnxx import XNXXIE -from .xvideos import XVideosIE +from .xstream import XstreamIE from .xtube import XTubeUserIE, XTubeIE from .xuite import XuiteIE +from .xvideos import XVideosIE from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, @@ -649,6 +763,7 @@ from .yandexmusic import ( YandexMusicPlaylistIE, ) from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 97d128560..5e43adc51 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -5,6 +5,8 @@ from ..utils import ( parse_duration, unified_strdate, str_to_int, + float_or_none, + ISO639Utils, ) @@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class AdobeTVVideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' + + _TEST = { + # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners + 'url': 'https://video.tv.adobe.com/v/2456/', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), + video_id) + + formats = [{ + 'url': source['src'], + 'width': source.get('width'), + 'height': source.get('height'), + 'tbr': source.get('bitrate'), + } for source in player_params['sources']] + + # For both metadata and downloaded files the duration varies among + # formats. 
I just pick the max one + duration = max(filter(None, [ + float_or_none(source.get('duration'), scale=1000) + for source in player_params['sources']])) + + subtitles = {} + for translation in player_params.get('translations', []): + lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + if lang_id not in subtitles: + subtitles[lang_id] = [] + subtitles[lang_id].append({ + 'url': translation['vttPath'], + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': player_params['title'], + 'description': self._og_search_description(webpage), + 'duration': duration, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py index e15c015fb..0c00acfb5 100644 --- a/youtube_dl/extractor/aftenposten.py +++ b/youtube_dl/extractor/aftenposten.py @@ -1,21 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - xpath_with_ns, - xpath_text, - find_xpath_attr, -) class AftenpostenIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P\d+)' - _TEST = { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', 'md5': 'fd828cd29774a729bf4d4425fe192972', @@ -30,69 +20,4 @@ class AftenpostenIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_xml( - 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id) - - NS_MAP = { - 'atom': 'http://www.w3.org/2005/Atom', - 'xt': 'http://xstream.dk/', - 'media': 'http://search.yahoo.com/mrss/', - } - - entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) - - title = xpath_text( - entry, xpath_with_ns('./atom:title', NS_MAP), 'title') - description = xpath_text( - entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') - timestamp = 
parse_iso8601(xpath_text( - entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) - - formats = [] - media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) - for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)): - media_url = media_content.get('url') - if not media_url: - continue - tbr = int_or_none(media_content.get('bitrate')) - mobj = re.search(r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', media_url) - if mobj: - formats.append({ - 'url': mobj.group('url'), - 'play_path': 'mp4:%s' % mobj.group('playpath'), - 'app': mobj.group('app'), - 'ext': 'flv', - 'tbr': tbr, - 'format_id': 'rtmp-%d' % tbr, - }) - else: - formats.append({ - 'url': media_url, - 'tbr': tbr, - }) - self._sort_formats(formats) - - link = find_xpath_attr( - entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') - if link is not None: - formats.append({ - 'url': link.get('href'), - 'format_id': link.get('rel'), - }) - - thumbnails = [{ - 'url': splash.get('url'), - 'width': int_or_none(splash.get('width')), - 'height': int_or_none(splash.get('height')), - } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?Particle[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 
'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py new file mode 100644 index 000000000..ea7a70393 --- /dev/null +++ b/youtube_dl/extractor/appleconnect.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + ExtractorError +) + + +class AppleConnectIE(InfoExtractor): + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P[\w-]+)' + _TEST = { + 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'info_dict': { + 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'ext': 'm4v', + 'title': 'Energy', + 'uploader': 'Drake', + 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'upload_date': '20150710', + 'timestamp': 1436545535, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + try: + video_json = self._html_search_regex( + r'class="auc-video-data">(\{.*?\})', webpage, 
'json') + except ExtractorError: + raise ExtractorError('This post doesn\'t contain a video', expected=True) + + video_data = self._parse_json(video_json, video_id) + timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + + return { + 'id': video_id, + 'url': video_data['sslSrc'], + 'title': video_data['title'], + 'description': video_data['description'], + 'uploader': video_data['artistName'], + 'thumbnail': video_data['artworkUrl'], + 'timestamp': timestamp, + 'like_count': like_count, + } diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 9fc35a42b..8feb7cb74 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -33,7 +33,7 @@ class ArchiveOrgIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_url = url + ('?' if '?' in url else '&') + 'output=json' + json_url = url + ('&' if '?' in url else '?') + 'output=json' data = self._download_json(json_url, video_id) def get_optional(data_dict, field): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6a35ea463..6f465789b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,6 +8,7 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, + get_element_by_attribute, qualities, int_or_none, parse_duration, @@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, + 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', + 'info_dict': { + 'id': '29582122', + 'ext': 'mp4', + 'title': 'Ich liebe das Leben trotzdem', + 'description': 'md5:45e4c225c72b27993314b31a84a5261c', + 'duration': 4557, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', + 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', 'info_dict': { - 'id': '22490580', + 'id': '29522730', 'ext': 'mp4', - 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', - 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. 
ab 20 Uhr)', + 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', + 'duration': 5252, }, - 'skip': 'Blocked outside of Germany', + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', + 'info_dict': { + 'id': '28488308', + 'ext': 'mp3', + 'title': 'Tod eines Fußballers', + 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', + 'duration': 3240, + }, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, }] + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + + formats = self._extract_formats(media_info, video_id) + + if not formats: + if '"fsk"' in webpage: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + raise ExtractorError('This video is not available due to geo restriction', expected=True) + + self._sort_formats(formats) + + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') 
+ server = stream.get('_server') + for stream_url in stream_urls: + ext = determine_ext(stream_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', + video_id, preference=-1, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + elif stream_url.startswith('http'): + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + else: + continue + m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -92,46 +199,22 @@ class ARDMediathekIE(InfoExtractor): 'format_id': fid, 'url': furl, }) + self._sort_formats(formats) + info = { + 'formats': formats, + } else: # request JSON file - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') - - formats = [] - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) - }) - continue - - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } - - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) + info = 
self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) - formats.append(format) - - self._sort_formats(formats) - - return { + info.update({ 'id': video_id, 'title': title, 'description': description, - 'formats': formats, 'thumbnail': thumbnail, - } + }) + + return info class ARDIE(InfoExtractor): @@ -189,3 +272,41 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class SportschauIE(ARDMediathekIE): + IE_NAME = 'Sportschau' + _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' + _TESTS = [{ + 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', + 'info_dict': { + 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', + 'ext': 'mp4', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + base_url = mobj.group('baseurl') + + webpage = self._download_webpage(url, video_id) + title = get_element_by_attribute('class', 'headline', webpage) + description = self._html_search_meta('description', webpage, 'description') + + info = self._extract_media_info( + base_url + '-mc_defaultQuality-h.json', webpage, video_id) + + info.update({ + 'title': title, + 'description': description, + }) + + return info diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..76de24477 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( find_xpath_attr, unified_strdate, - 
get_element_by_id, get_element_by_attribute, int_or_none, qualities, @@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py new file mode 100644 index 000000000..e37ee4440 --- /dev/null +++ b/youtube_dl/extractor/baidu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse + + +class BaiduVideoIE(InfoExtractor): + IE_DESC = '百度视频' + _VALID_URL = r'http://v\.baidu\.com/(?P[a-z]+)/(?P\d+)\.htm' + _TESTS = [{ + 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', + 'info_dict': { + 'id': '1069', + 'title': '中华小当家 TV版 (全52集)', + 'description': 'md5:395a419e41215e531c857bb037bbaf80', + }, + 'playlist_count': 52, + }, { + 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand', + 'info_dict': { + 'id': '11595', + 'title': 're:^奔跑吧兄弟', + 'description': 'md5:1bf88bad6d850930f542d51547c089b8', + }, + 'playlist_mincount': 3, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + category = category2 = mobj.group('type') + if category == 'show': + category2 = 'tvshow' + + webpage = self._download_webpage(url, playlist_id) + + playlist_title = self._html_search_regex( + r'title\s*:\s*(["\'])(?P[^\']+)\1', webpage, + 'playlist title', group='title') + playlist_description = self._html_search_regex( + r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage, + playlist_id, 'playlist description') + + site = self._html_search_regex( + 
r'filterSite\s*:\s*["\']([^"]*)["\']', webpage, + 'primary provider site') + api_result = self._download_json( + 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % ( + category, category2, playlist_id, site), + playlist_id, 'Get playlist links') + + entries = [] + for episode in api_result[0]['episodes']: + episode_id = '%s_%s' % (playlist_id, episode['episode']) + + redirect_page = self._download_webpage( + compat_urlparse.urljoin(url, episode['url']), episode_id, + note='Download Baidu redirect page') + real_url = self._html_search_regex( + r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL') + + entries.append(self.url_result( + real_url, video_title=episode['single_title'])) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index c193e66ca..8dff1d6e3 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -1,12 +1,18 @@ from __future__ import unicode_literals import re -import json import itertools from .common import InfoExtractor from ..compat import ( + compat_urllib_parse, compat_urllib_request, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, ) @@ -14,6 +20,8 @@ class BambuserIE(InfoExtractor): IE_NAME = 'bambuser' _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' _API_KEY = '005f64509e19a868399060af746a00aa' + _LOGIN_URL = 'https://bambuser.com/user' + _NETRC_MACHINE = 'bambuser' _TEST = { 'url': 'http://bambuser.com/v/4050584', @@ -26,6 +34,9 @@ class BambuserIE(InfoExtractor): 'duration': 3741, 'uploader': 'pixelversity', 'uploader_id': '344706', + 'timestamp': 1382976692, + 'upload_date': '20131028', + 'view_count': int, }, 'params': { # It doesn't respect the 'Range' header, it would download the whole video @@ -34,23 +45,60 @@ class BambuserIE(InfoExtractor): }, } + def _login(self): + (username, password) = self._get_login_info() 
+ if username is None: + return + + login_form = { + 'form_id': 'user_login', + 'op': 'Log in', + 'name': username, + 'pass': password, + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + request.add_header('Referer', self._LOGIN_URL) + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">(.+?)</div>', + response, 'login error', default=None) + if login_error: + raise ExtractorError( + 'Unable to login: %s' % login_error, expected=True) + + def _real_initialize(self): + self._login() + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - info_url = ('http://player-c.api.bambuser.com/getVideo.json?' - '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) - info_json = self._download_webpage(info_url, video_id) - info = json.loads(info_json)['result'] + video_id = self._match_id(url) + + info = self._download_json( + 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s' + % (self._API_KEY, video_id), video_id) + + error = info.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) + + result = info['result'] return { 'id': video_id, - 'title': info['title'], - 'url': info['url'], - 'thumbnail': info.get('preview'), - 'duration': int(info['length']), - 'view_count': int(info['views_total']), - 'uploader': info['username'], - 'uploader_id': info['owner']['uid'], + 'title': result['title'], + 'url': result['url'], + 'thumbnail': result.get('preview'), + 'duration': int_or_none(result.get('length')), + 'uploader': result.get('username'), + 'uploader_id': compat_str(result.get('owner', {}).get('uid')), + 'timestamp': int_or_none(result.get('created')), + 'fps': float_or_none(result.get('framerate')), + 'view_count': int_or_none(result.get('views_total')), + 'comment_count': 
int_or_none(result.get('comment_count')), } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 869294967..505877b77 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -72,7 +72,7 @@ class BandcampIE(InfoExtractor): download_link = m_download.group(1) video_id = self._search_regex( - r'(?ms)var TralbumData = {.*?id: (?P<id>\d+),?$', + r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', webpage, 'video id') download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index abc34a576..5825d2867 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import xml.etree.ElementTree from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, +) from ..compat import compat_HTTPError @@ -112,6 +115,34 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. 
Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -220,26 +251,11 @@ class BBCCoUkIE(InfoExtractor): for connection in self._extract_connections(media): captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) - srt = '' - - def _extract_text(p): - if p.text is not None: - stripped_text = p.text.strip() - if stripped_text: - return stripped_text - return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) - for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) subtitles[lang] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, - { - 'data': srt, - 'ext': 'srt', - }, ] return subtitles @@ -250,7 +266,7 @@ class BBCCoUkIE(InfoExtractor): programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + media_selection = 
xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) else: raise @@ -326,16 +342,27 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') - programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + programme_id = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + if programme_id: - player = self._download_json( - 'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id, - group_id)['jsConf']['player'] - title = player['title'] - description = player['subtitle'] - duration = player['duration'] formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage) + description = self._search_regex( + r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + webpage, 'description', fatal=False) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) @@ -345,6 +372,7 @@ class BBCCoUkIE(InfoExtractor): 'id': programme_id, 'title': title, 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, 'subtitles': subtitles, diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index d2abd4d77..03dad4636 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote from ..utils import ( 
xpath_text, xpath_with_ns, @@ -16,11 +16,11 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471', + 'id': 'news/national/2014/a-conversation-with-president-obama', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', - 'title': 'BET News Presents: A Conversation With President Obama', - 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6', + 'title': 'A Conversation With President Obama', + 'description': 'md5:699d0652a350cf3e491cd15cc745b5da', 'duration': 1534, 'timestamp': 1418075340, 'upload_date': '20141208', @@ -35,7 +35,7 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d', + 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for Ferguson: A Community Reacts', @@ -57,10 +57,13 @@ class BetIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - media_url = compat_urllib_parse.unquote(self._search_regex( + media_url = compat_urllib_parse_unquote(self._search_regex( [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], webpage, 'media URL')) + video_id = self._search_regex( + r'/video/(.*)/_jcr_content/', media_url, 'video id') + mrss = self._download_xml(media_url, display_id) item = mrss.find('./channel/item') @@ -75,8 +78,6 @@ class BetIE(InfoExtractor): description = xpath_text( item, './description', 'description', fatal=False) - video_id = xpath_text(item, './guid', 'video id', fatal=False) - timestamp = parse_iso8601(xpath_text( item, xpath_with_ns('./dc:date', NS_MAP), 'upload date', fatal=False)) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index 
77b562d99..4d8cce1ef 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + fix_xml_ampersands, +) class BildIE(InfoExtractor): @@ -15,7 +18,7 @@ class BildIE(InfoExtractor): 'id': '38184146', 'ext': 'mp4', 'title': 'BILD hat sie getestet', - 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 196, 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', } @@ -25,7 +28,7 @@ class BildIE(InfoExtractor): video_id = self._match_id(url) xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" - doc = self._download_xml(xml_url, video_id) + doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands) duration = int_or_none(doc.attrib.get('duration'), scale=1000) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 75d744852..ecc17ebeb 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -2,34 +2,54 @@ from __future__ import unicode_literals import re +import itertools +import json +import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + ExtractorError, ) class BiliBiliIE(InfoExtractor): _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402', + 'id': '1074402_part1', 'ext': 'flv', 'title': '【金坷垃】金泡沫', 'duration': 308, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', }, - } + }, { + 'url': 'http://www.bilibili.com/video/av1041170/', + 
'info_dict': { + 'id': '1041170', + 'title': '【BD1080P】刀语【诸神&异域】', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if '(此视频不存在或被删除)' in webpage: + raise ExtractorError( + 'The video does not exist or was deleted', expected=True) + + if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: + raise ExtractorError( + 'The video is not available in your region due to copyright reasons', + expected=True) + video_code = self._search_regex( r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') @@ -54,19 +74,22 @@ class BiliBiliIE(InfoExtractor): cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - lq_doc = self._download_xml( + entries = [] + + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) - lq_durl = lq_doc.find('./durl') - formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), - }] + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = ET.fromstring(lq_page) + lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, @@ -75,22 +98,45 @@ class BiliBiliIE(InfoExtractor): fatal=False, ) if hq_doc is not False: - hq_durl = hq_doc.find('./durl') - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) + + i = 1 + for lq_durl, hq_durl in zip(lq_durls, hq_durls): + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), + 
lq_durl.find('./size'), get_attr='text'), + }] + if hq_durl is not None: + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': hq_durl.find('./url').text, + 'filesize': int_or_none( + hq_durl.find('./size'), get_attr='text'), + }) + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%d' % (video_id, i), + 'title': title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnail': thumbnail, }) - self._sort_formats(formats) + i += 1 + return { + '_type': 'multi_video', + 'entries': entries, 'id': video_id, - 'title': title, - 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, + 'title': title } diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index b632ce967..c3296283d 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_request, compat_urlparse, ) @@ -14,6 +13,8 @@ from ..utils import ( int_or_none, parse_iso8601, unescapeHTML, + xpath_text, + xpath_with_ns, ) @@ -23,10 +24,10 @@ class BlipTVIE(InfoExtractor): _TESTS = [ { 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'md5': '80baf1ec5c3d2019037c1c707d676b9f', 'info_dict': { 'id': '5779306', - 'ext': 'mov', + 'ext': 'm4v', 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', 'timestamp': 1323138843, @@ -100,8 +101,31 @@ class BlipTVIE(InfoExtractor): 'vcodec': 'none', } }, + { + # missing duration + 'url': 'http://blip.tv/rss/flash/6700880', + 'info_dict': { + 'id': '6684191', + 'ext': 'm4v', + 'title': 'Cowboy Bebop: Gateway Shuffle Review', + 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', + 'timestamp': 1386639757, + 'upload_date': '20131210', + 
'uploader': 'sfdebris', + 'uploader_id': '706520', + } + } ] + @staticmethod + def _extract_url(webpage): + mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) + if mobj: + return 'http://blip.tv/a/a-' + mobj.group(1) + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) + if mobj: + return mobj.group(1) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) lookup_id = mobj.group('lookup_id') @@ -119,35 +143,34 @@ class BlipTVIE(InfoExtractor): rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - def blip(s): - return '{http://blip.tv/dtd/blip/1.0}%s' % s - - def media(s): - return '{http://search.yahoo.com/mrss/}%s' % s - - def itunes(s): - return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s + def _x(p): + return xpath_with_ns(p, { + 'blip': 'http://blip.tv/dtd/blip/1.0', + 'media': 'http://search.yahoo.com/mrss/', + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + }) item = rss.find('channel/item') - video_id = item.find(blip('item_id')).text - title = item.find('./title').text - description = clean_html(compat_str(item.find(blip('puredescription')).text)) - timestamp = parse_iso8601(item.find(blip('datestamp')).text) - uploader = item.find(blip('user')).text - uploader_id = item.find(blip('userid')).text - duration = int(item.find(blip('runtime')).text) - media_thumbnail = item.find(media('thumbnail')) - thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text - categories = [category.text for category in item.findall('category')] + video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id + title = xpath_text(item, 'title', 'title', fatal=True) + description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) + timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 
'timestamp')) + uploader = xpath_text(item, _x('blip:user'), 'uploader') + uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') + duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) + media_thumbnail = item.find(_x('media:thumbnail')) + thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None + else xpath_text(item, 'image', 'thumbnail')) + categories = [category.text for category in item.findall('category') if category is not None] formats = [] subtitles_urls = {} - media_group = item.find(media('group')) - for media_content in media_group.findall(media('content')): + media_group = item.find(_x('media:group')) + for media_content in media_group.findall(_x('media:content')): url = media_content.get('url') - role = media_content.get(blip('role')) + role = media_content.get(_x('blip:role')) msg = self._download_webpage( url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', video_id, 'Resolving URL for %s' % role) @@ -166,8 +189,8 @@ class BlipTVIE(InfoExtractor): 'url': real_url, 'format_id': role, 'format_note': media_type, - 'vcodec': media_content.get(blip('vcodec')) or 'none', - 'acodec': media_content.get(blip('acodec')), + 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', + 'acodec': media_content.get(_x('blip:acodec')), 'filesize': media_content.get('filesize'), 'width': int_or_none(media_content.get('width')), 'height': int_or_none(media_content.get('height')), diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 45ba51732..66e394e10 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -16,27 +16,38 @@ class BRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html', - 'md5': '93556dd2bcb2948d9259f8670c516d59', + 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', + 'md5': 
'83a0477cf0b8451027eb566d88b51106', 'info_dict': { - 'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a', + 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', - 'title': 'Wenn das Traditions-Theater wackelt', - 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', - 'duration': 34, - 'uploader': 'BR', - 'upload_date': '20140802', + 'title': 'Die böse Überraschung', + 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'duration': 180, + 'uploader': 'Reinhard Weber', + 'upload_date': '20150422', } }, { - 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html', - 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820', + 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', + 'md5': 'a44396d73ab6a68a69a568fae10705bb', 'info_dict': { - 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab', + 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', + 'ext': 'mp4', + 'title': 'Manfred Schreiber ist tot', + 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'duration': 26, + } + }, + { + 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', + 'info_dict': { + 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', - 'title': '"Keine neuen Schulden im nächsten Jahr"', - 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', - 'duration': 64, + 'title': 'Kurzweilig und sehr bewegend', + 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'duration': 296, } }, { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0733bece7..4721c2293 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -13,6 +13,7 @@ from ..compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + 
compat_xml_parse_error, ) from ..utils import ( determine_ext, @@ -117,7 +118,10 @@ class BrightcoveIE(InfoExtractor): object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) object_str = fix_xml_ampersands(object_str) - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + try: + object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + except compat_xml_parse_error: + return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') if fv_el is not None: @@ -153,6 +157,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove <object /> XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? 
# skipping width and height + ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID + ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P<videoID>\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -169,7 +195,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"', + r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]', webpage) if url_m: url = unescapeHTML(url_m.group(1)) @@ -183,9 +209,14 @@ class BrightcoveIE(InfoExtractor): (?: [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] | [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ - ).+?</object>''', + ).+?>\s*</object>''', webpage) - return [cls._build_brighcove_url(m) for m in matches] + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 6252be05b..3b2de517e 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:5438d33774b6bdc662f9485a340401cc', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*promo.*' + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, diff --git 
a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1b14471e5..699b4f7d0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor): } _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - 'md5': '3db39fb48b9685438ecf33a1078023e4', + 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', + 'md5': 'b3481d7ca972f61e37420798d0a9d934', 'info_dict': { - 'id': '922470', + 'id': '1263092', 'ext': 'flv', - 'title': 'Zapping - 26/08/13', - 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', - 'upload_date': '20130826', + 'title': 'Le Zapping - 13/05/15', + 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', + 'upload_date': '20150513', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', @@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor): 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': '65aa83ad62fe107ce29e564bb8712580', + 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4', 'info_dict': { 'id': '1213714', 'ext': 'flv', diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1ceb9d8d9..75fffb156 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,12 +4,13 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '4JUVEwq3wUT7', + 'display_id': 'connect-chat-feat-garth-brooks', 
'ext': 'flv', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -24,6 +25,7 @@ class CBSIE(InfoExtractor): 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', 'info_dict': { 'id': 'WWF_5KqY3PK1', + 'display_id': 'st-vincent', 'ext': 'flv', 'title': 'Live on Letterman - St. Vincent', 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', @@ -34,12 +36,23 @@ class CBSIE(InfoExtractor): 'skip_download': True, }, '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) real_id = self._search_regex( - r"video\.settings\.pid\s*=\s*'([^']+)';", + [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') - return self.url_result('theplatform:%s' % real_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': 'theplatform:%s' % real_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7e47960ab..52e61d85b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor): 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', 'ext': 'flv', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of 
attack', - 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, }, 'params': { diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 2a5d4be18..6924eac70 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -16,7 +16,7 @@ class CCCIE(InfoExtractor): _TEST = { 'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video', - 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be', + 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { 'id': '20131228183', 'ext': 'mp4', @@ -51,7 +51,7 @@ class CCCIE(InfoExtractor): matches = re.finditer(r'''(?xs) <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s* - <a\s+href='(?P<http_url>[^']+)'>\s* + <a\s+download\s+href='(?P<http_url>[^']+)'>\s* (?: .*? 
<a\s+href='(?P<torrent_url>[^']+\.torrent)' diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 65f6be623..dda583680 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -88,7 +89,7 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) + req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) playlist = self._download_json(req, video_id) diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..c949a4814 --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,110 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError +from .bliptv import BlipTVIE + + +class CinemassacreIE(InfoExtractor): + _VALID_URL = 
'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' + _TESTS = [ + { + 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', + 'info_dict': { + 'id': 'Cinemassacre-19911', + 'ext': 'mp4', + 'upload_date': '20121110', + 'title': '“Angry Video Game Nerd: The Movie” – Trailer', + 'description': 'md5:fb87405fcb42a331742a0dce2708560b', + }, + }, + { + 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + 'md5': 'd72f10cd39eac4215048f62ab477a511', + 'info_dict': { + 'id': 'Cinemassacre-521be8ef82b16', + 'ext': 'mp4', + 'upload_date': '20131002', + 'title': 'The Mummy’s Hand (1940)', + }, + }, + { + # blip.tv embedded video + 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', + 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'info_dict': { + 'id': '4065369', + 'ext': 'flv', + 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', + 'upload_date': '20061207', + 'uploader': 'cinemassacre', + 'uploader_id': '250778', + 'timestamp': 1283233867, + 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + } + }, + { + # Youtube embedded video + 'url': 'http://cinemassacre.com/2006/09/01/mckids/', + 'md5': '6eb30961fa795fedc750eac4881ad2e1', + 'info_dict': { + 'id': 'FnxsNhuikpo', + 'ext': 'mp4', + 'upload_date': '20060901', + 'uploader': 'Cinemassacre Extras', + 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', + 'uploader_id': 'Cinemassacre', + 'title': 'AVGN: McKids', + } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') + + webpage = self._download_webpage(url, display_id) + + playerdata_url = self._search_regex( + [ + r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', + ], + webpage, 'player data URL', default=None) + if not playerdata_url: + playerdata_url = BlipTVIE._extract_url(webpage) + if not playerdata_url: + raise ExtractorError('Unable to find player data') + + video_title = self._html_search_regex( + r'<title>(?P<title>.+?)\|', webpage, 'title') + video_description = self._html_search_regex( + r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, 'description', flags=re.DOTALL, fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + 'url': playerdata_url, + } diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d07d544ea..8306d6fb7 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -10,9 +8,9 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' + _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' - 
_TEST = { + _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'md5': '4d7d549451bad625e0ff3d7bd56d776c', 'info_dict': { @@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor): 'duration': 612, 'thumbnail': 're:^https?://.+\.jpg', }, - } + }, { + 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, video_id, 'Downlaoding player') diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3145b3051..5dd69bff7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -11,7 +11,7 @@ from ..utils import ( class CNETIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', @@ -25,7 +25,20 @@ class CNETIE(InfoExtractor): 'params': { 'skip_download': 'requires rtmpdump', } - } + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'info_dict': { + 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', + 'ext': 'flv', + 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. 
#TDPothole', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -42,7 +55,7 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files']['rtmp'] + vid = vdata['files'].get('rtmp', vdata['files']['hds']) tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) video_id = vdata['id'] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index e5edcc84b..91ebb0ce5 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): uri = mMovieParams[0][1] # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri) + uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) idoc = self._download_xml( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ed97f8dd..b9014fc23 100644 --- a/youtube_dl/extractor/common.py +++ 
b/youtube_dl/extractor/common.py @@ -22,17 +22,20 @@ from ..compat import ( compat_str, ) from ..utils import ( + NO_DEFAULT, age_restricted, + bug_reports_message, clean_html, compiled_regex_type, + determine_ext, ExtractorError, + fix_xml_ampersands, float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, unescapeHTML, ) -_NO_DEFAULT = object() class InfoExtractor(object): @@ -46,7 +49,7 @@ class InfoExtractor(object): information possibly downloading the video to the file system, among other possible outcomes. - The type field determines the the type of the result. + The type field determines the type of the result. By far the most common value (and the default if _type is missing) is "video", which indicates a single video. @@ -110,11 +113,8 @@ class InfoExtractor(object): (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. - * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. - * http_post_data Additional data to send with a POST - request. * stretched_ratio If given and not 1, indicates that the video's pixels are not square. width : height ratio as float. 
@@ -324,7 +324,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -334,14 +334,11 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) return (content, urlh) - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() - if prefix is not None: - webpage_bytes = prefix + webpage_bytes + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -354,6 +351,16 @@ class InfoExtractor(object): encoding = 'utf-16' else: encoding = 'utf-8' + + return encoding + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes + if not encoding: + encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -410,13 +417,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, 
fatal=True, tries=1, timeout=5): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -431,10 +438,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True): + transform_source=None, fatal=True, encoding=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) if xml_string is False: return xml_string if transform_source: @@ -445,9 +452,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True): + fatal=True, encoding=None): json_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding) if (not fatal) and json_string is False: return None return self._parse_json( @@ -517,7 +525,7 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. 
@@ -543,16 +551,15 @@ class InfoExtractor(object): return next(g for g in mobj.groups() if g is not None) else: return mobj.group(group) - elif default is not _NO_DEFAULT: + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: - self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ @@ -564,7 +571,7 @@ class InfoExtractor(object): def _get_login_info(self): """ - Get the the login info as (username, password) + Get the login info as (username, password) It will look in the netrc file using the _NETRC_MACHINE value If there's no info available, return (None, None) """ @@ -700,7 +707,26 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _sort_formats(self, formats): + @staticmethod + def _hidden_inputs(html): + return dict([ + (input.group('name'), input.group('value')) for input in re.finditer( + r'''(?x) + <input\s+ + type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+ + name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+ + (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)? 
+ value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value) + ''', html) + ]) + + def _form_hidden_inputs(self, form_id, html): + form = self._search_regex( + r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, + html, '%s form' % form_id, group='form') + return self._hidden_inputs(form) + + def _sort_formats(self, formats, field_preference=None): if not formats: raise ExtractorError('No video formats found') @@ -710,6 +736,9 @@ class InfoExtractor(object): if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) + if isinstance(field_preference, (list, tuple)): + return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + preference = f.get('preference') if preference is None: proto = f.get('protocol') @@ -756,7 +785,7 @@ class InfoExtractor(object): f.get('fps') if f.get('fps') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id'), + f.get('format_id') if f.get('format_id') is not None else '', ) formats.sort(key=_formats_key) @@ -778,8 +807,8 @@ class InfoExtractor(object): return True except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): - self.report_warning( - '%s URL is invalid, skipping' % item, video_id) + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) return False raise @@ -807,10 +836,14 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip()): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest') + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=transform_source) formats = [] manifest_version = '1.0' @@ -820,8 +853,19 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + - (media_el.attrib.get('href') or media_el.attrib.get('url'))) + media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + if not media_url: + continue + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + if determine_ext(manifest_url) == 'f4m': + formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), @@ -838,7 +882,8 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None, + fatal=True): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -857,8 +902,11 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information', + fatal=fatal) + if m3u8_doc is False: + return m3u8_doc last_info = None last_media = 
None kv_rex = re.compile( @@ -888,7 +936,7 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media else None + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), @@ -948,7 +996,7 @@ class InfoExtractor(object): def _parse_smil_video(self, video, video_id, base, rtmp_count): src = video.get('src') if not src: - return ([], rtmp_count) + return [], rtmp_count bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -961,7 +1009,7 @@ class InfoExtractor(object): proto = 'http' ext = video.get('ext') if proto == 'm3u8': - return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count) + return self._extract_m3u8_formats(src, video_id, ext), rtmp_count elif proto == 'rtmp': rtmp_count += 1 streamer = video.get('streamer') or base @@ -1064,9 +1112,6 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") - def _subtitles_timecode(self, seconds): - return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) - class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index cf763ee7e..94d03ce2a 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -11,39 +11,65 @@ from ..utils import ( class CrackedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html' - _TEST = { + _TESTS = [{ + 'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html', + 'md5': 
'89b90b9824e3806ca95072c4d78f13f7', + 'info_dict': { + 'id': '19070', + 'ext': 'mp4', + 'title': 'If Animal Actors Got E! True Hollywood Stories', + 'timestamp': 1404954000, + 'upload_date': '20140710', + } + }, { + # youtube embed 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', - 'md5': '4b29a5eeec292cd5eca6388c7558db9e', + 'md5': 'ccd52866b50bde63a6ef3b35016ba8c7', 'info_dict': { - 'id': '19006', + 'id': 'EjI00A3rZD0', 'ext': 'mp4', - 'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies', - 'description': 'md5:3b909e752661db86007d10e5ec2df769', - 'timestamp': 1405659600, - 'upload_date': '20140718', + 'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take", + 'description': 'md5:c603708c718b796fe6079e2b3351ffc7', + 'upload_date': '20140725', + 'uploader_id': 'Cracked', + 'uploader': 'Cracked', } - } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + youtube_url = self._search_regex( + r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', + webpage, 'youtube url', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + video_url = self._html_search_regex( - [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], webpage, 'video URL') + [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], + webpage, 'video URL') + + title = self._search_regex( + [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'], + webpage, 'title') - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) + description = self._search_regex( + r'name="?(?:og:)?description"?\s+content="([^"]+)"', + webpage, 'description', default=None) - timestamp = self._html_search_regex(r'<time datetime="([^"]+)"', webpage, 'upload date', fatal=False) + timestamp = 
self._html_search_regex( + r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False) if timestamp: timestamp = parse_iso8601(timestamp[:-6]) view_count = str_to_int(self._html_search_regex( - r'<span class="views" id="viewCounts">([\d,\.]+) Views</span>', webpage, 'view count', fatal=False)) + r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>', + webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( - r'<span id="commentCounts">([\d,\.]+)</span>', webpage, 'comment count', fatal=False)) + r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>', + webpage, 'comment count', fatal=False)) m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url) if m: diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 6ded723c9..d1b6d7366 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -12,6 +12,7 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -27,7 +28,7 @@ from ..aes import ( class CrunchyrollIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -45,6 +46,22 @@ class CrunchyrollIE(InfoExtractor): # rtmp 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', + 'info_dict': { + 'id': '589804', + 'ext': 'flv', + 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', + 'description': 
'md5:fe2743efedb49d279552926d0bd0cd9e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Danny Choo Network', + 'upload_date': '20120213', + }, + 'params': { + # rtmp + 'skip_download': True, + }, + }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -76,8 +93,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(data) - iv = bytes_to_intlist(iv) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -179,6 +196,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _extract_subtitles(self, subtitle): + sub_root = xml.etree.ElementTree.fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): @@ -190,25 +217,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - subtitles[lang_code] = [ - { - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, - { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }, - ] + subtitles[lang_code] = self._extract_subtitles(subtitle) 
return subtitles def _real_extract(self, url): @@ -242,7 +255,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) - playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) + playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') @@ -255,16 +268,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' - streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/') - # urlencode doesn't work! 
- streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format + streamdata_req = compat_urllib_request.Request( + 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' + % (stream_id, stream_format, stream_quality), + compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) - video_url = streamdata.find('.//host').text - video_play_path = streamdata.find('.//file').text + stream_info = streamdata.find('./{default}preload/stream_info') + video_url = stream_info.find('./host').text + video_play_path = stream_info.find('./file').text formats.append({ 'url': video_url, 'play_path': video_play_path, diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 955119d40..fbefd37d0 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -7,7 +7,10 @@ from ..utils import ( int_or_none, unescapeHTML, find_xpath_attr, + smuggle_url, + determine_ext, ) +from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): @@ -35,11 +38,22 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', + 'md5': '446562a736c6bf97118e389433ed88d4', 'info_dict': { 'id': '342759', + 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', + 'duration': 14848, + 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' }, - 'playlist_duration_sum': 14855, + }, { + # Video from senate.gov + 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', + 'info_dict': { + 'id': 'judiciary031715', 
+ 'ext': 'flv', + 'title': 'Immigration Reforms Needed to Protect Skilled American Workers', + } }] def _real_extract(self, url): @@ -56,7 +70,7 @@ class CSpanIE(InfoExtractor): # present, otherwise this is a stripped version r'<p class=\'initial\'>(.*?)</p>' ], - webpage, 'description', flags=re.DOTALL) + webpage, 'description', flags=re.DOTALL, default=None) info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id data = self._download_json(info_url, video_id) @@ -68,7 +82,16 @@ class CSpanIE(InfoExtractor): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) + files = data['video']['files'] + try: + capfile = data['video']['capfile']['#text'] + except KeyError: + capfile = None entries = [{ 'id': '%s_%d' % (video_id, partnum + 1), @@ -79,11 +102,22 @@ class CSpanIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'duration': int_or_none(f.get('length', {}).get('#text')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, } for partnum, f in enumerate(files)] - return { - '_type': 'playlist', - 'entries': entries, - 'title': title, - 'id': video_id, - } + if len(entries) == 1: + entry = dict(entries[0]) + entry['id'] = video_id + return entry + else: + return { + '_type': 'playlist', + 'entries': entries, + 'title': title, + 'id': video_id, + } diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 0226f8036..45049bf37 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): + IE_DESC 
= '華視新聞' # https connection failed (Connection reset) _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7615ecd4b..1a41c0db1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -52,6 +52,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'ext': 'mp4', 'uploader': 'IGN', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'upload_date': '20150306', + 'duration': 74, } }, # Vevo video @@ -85,7 +87,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.dailymotion.com/video/%s' % video_id + url = 'https://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information request = self._build_request(url) @@ -106,11 +108,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) video_upload_date = None - mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) + mobj = re.search(r'<meta property="video:release_date" content="([0-9]{4})-([0-9]{2})-([0-9]{2}).+?"/>', webpage) if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) + video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) - embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id + embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id embed_request = self._build_request(embed_url) embed_page = self._download_webpage( embed_request, video_id, 'Downloading embed page') @@ -163,6 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, 'view_count': view_count, + 'duration': info['duration'] } def _get_subtitles(self, video_id, webpage): @@ -224,7 +227,7 @@ class 
DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -238,7 +241,8 @@ class DailymotionUserIE(DailymotionPlaylistIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') - webpage = self._download_webpage(url, user) + webpage = self._download_webpage( + 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), webpage, 'user')) @@ -249,3 +253,53 @@ class DailymotionUserIE(DailymotionPlaylistIE): 'title': full_user, 'entries': self._extract_entries(user), } + + +class DailymotionCloudIE(DailymotionBaseInfoExtractor): + _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' + _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX + _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX + + _TESTS = [{ + # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html + # Tested at FranceTvInfo_2 + 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', + 'only_matching': True, + }, { + # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html + 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1', + 'only_matching': True, + }] + + @classmethod + def _extract_dmcloud_url(self, webpage): + mobj = 
re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage) + if mobj: + return mobj.group(1) + + mobj = re.search( + r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, + webpage) + if mobj: + return mobj.group(1) + + def _real_extract(self, url): + video_id = self._match_id(url) + + request = self._build_request(url) + webpage = self._download_webpage(request, video_id) + + title = self._html_search_regex(r'<title>([^>]+)', webpage, 'title') + + video_info = self._parse_json(self._search_regex( + r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) + + # TODO: parse ios_url, which is in fact a manifest + video_url = video_info['mp4_url'] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': video_info.get('thumbnail_url'), + } diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 8049779b0..263532cc6 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -3,42 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import unified_strdate class DFBIE(InfoExtractor): IE_NAME = 'tv.dfb.de' - _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P\d+)' + _VALID_URL = r'https?://tv\.dfb\.de/video/(?P[^/]+)/(?P\d+)' _TEST = { - 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/', + 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', # The md5 is different each time 'info_dict': { - 'id': '9070', + 'id': '11633', + 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', 'ext': 'flv', - 'title': 'Highlights des Empfangs in Berlin', - 'upload_date': '20140716', + 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', + 'upload_date': '20150714', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, 
video_id) + webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, - video_id) + display_id) video_info = player_info.find('video') - f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id) + f4m_info = self._download_xml( + self._proto_relative_url(video_info.find('url').text.strip()), display_id) token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' + formats = self._extract_f4m_formats(manifest_url, display_id) return { 'id': video_id, + 'display_id': display_id, 'title': video_info.find('title').text, - 'url': manifest_url, - 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]), + 'upload_date': unified_strdate(video_info.find('time_date').text), + 'formats': formats, } diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index d3e667528..d6723ecf2 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -2,19 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + parse_duration, parse_iso8601, - int_or_none, ) +from ..compat import compat_str class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P[a-zA-Z0-9_\-]*)(?:\.htm)?' 
- _TEST = { + _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'md5': '3c69d77d9b0d82bfd5e5932a60f26504', 'info_dict': { - 'id': 'mission-impossible-outtakes', - 'ext': 'flv', + 'id': '20769', + 'ext': 'mp4', 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' @@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor): 'timestamp': 1303099200, 'upload_date': '20110418', }, - } + 'params': { + 'skip_download': True, # requires ffmpeg + } + }, { + 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', + 'info_dict': { + 'id': 'mythbusters-the-simpsons', + 'title': 'MythBusters: The Simpsons', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + info = self._download_json(url + '?flat=1', video_id) - info = self._parse_json(self._search_regex( - r'(?s)', - webpage, 'video info'), video_id) + video_title = info.get('playlist_title') or info.get('video_title') - return { - 'id': video_id, - 'title': info['name'], - 'url': info['contentURL'], - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'timestamp': parse_iso8601(info.get('uploadDate')), - 'duration': int_or_none(info.get('duration')), - } + entries = [{ + 'id': compat_str(video_info['id']), + 'formats': self._extract_m3u8_formats( + video_info['src'], video_id, ext='mp4', + note='Download m3u8 information for video %d' % (idx + 1)), + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + } for idx, 
video_info in enumerate(info['playlist'])] + + return self.playlist_result(entries, video_id, video_title) diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index f51d88a98..e9ca236d4 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -36,7 +36,8 @@ class DotsubIE(InfoExtractor): if not video_url: webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'"file"\s*:\s*\'([^\']+)', webpage, 'video url') + [r']+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'], + webpage, 'video url') return { 'id': video_id, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 479430c51..373b3b4b4 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -9,6 +9,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): + IE_DESC = '斗鱼' _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py new file mode 100644 index 000000000..38e6597c8 --- /dev/null +++ b/youtube_dl/extractor/dramafever.py @@ -0,0 +1,216 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) + + +class DramaFeverBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' + _NETRC_MACHINE = 'dramafever' + + _CONSUMER_SECRET = 'DA59dtVXYLxajktV' + + _consumer_secret = None + + def _get_consumer_secret(self): + mainjs = self._download_webpage( + 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', + None, 'Downloading main.js', fatal=False) + if not mainjs: + return self._CONSUMER_SECRET + 
return self._search_regex( + r"var\s+cs\s*=\s*'([^']+)'", mainjs, + 'consumer secret', default=self._CONSUMER_SECRET) + + def _real_initialize(self): + self._login() + self._consumer_secret = self._get_consumer_secret() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'username': username, + 'password': password, + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if all(logout_pattern not in response + for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): + error = self._html_search_regex( + r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class DramaFeverIE(DramaFeverBaseIE): + IE_NAME = 'dramafever' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)(?:/|$)' + _TEST = { + 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512.1', + 'ext': 'flv', + 'title': 'Cooking with Shin 4512.1', + 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1404336058, + 'upload_date': '20140702', + 'duration': 343, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url).replace('/', '.') + + try: + feed = self._download_json( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, + video_id, 'Downloading episode JSON')['channel']['item'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + raise ExtractorError( + 'Currently unavailable in your country.', expected=True) + raise + + media_group = feed.get('media-group', {}) + + formats = [] + for media_content in media_group['media-content']: 
+ src = media_content.get('@attributes', {}).get('url') + if not src: + continue + ext = determine_ext(src) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src, video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id='hls')) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + title = media_group.get('media-title') + description = media_group.get('media-description') + duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) + thumbnail = self._proto_relative_url( + media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) + timestamp = parse_iso8601(feed.get('pubDate'), ' ') + + subtitles = {} + for media_subtitle in media_group.get('media-subTitle', []): + lang = media_subtitle.get('@attributes', {}).get('lang') + href = media_subtitle.get('@attributes', {}).get('href') + if not lang or not href: + continue + subtitles[lang] = [{ + 'ext': 'ttml', + 'url': href, + }] + + series_id, episode_number = video_id.split('.') + episode_info = self._download_json( + # We only need a single episode info, so restricting page size to one episode + # and dealing with page number as with episode number + r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' + % (self._consumer_secret, series_id, episode_number), + video_id, 'Downloading episode info JSON', fatal=False) + if episode_info: + value = episode_info.get('value') + if value: + subfile = value[0].get('subfile') or value[0].get('new_subfile') + if subfile and subfile != 'http://www.dramafever.com/st/': + subtitles.setdefault('English', []).append({ + 'ext': 'srt', + 'url': subfile, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class 
DramaFeverSeriesIE(DramaFeverBaseIE): + IE_NAME = 'dramafever:series' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' + _TESTS = [{ + 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512', + 'title': 'Cooking with Shin', + 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.dramafever.com/drama/124/IRIS/', + 'info_dict': { + 'id': '124', + 'title': 'IRIS', + 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862', + }, + 'playlist_count': 20, + }] + + _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) + + def _real_extract(self, url): + series_id = self._match_id(url) + + series = self._download_json( + 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s' + % (self._consumer_secret, series_id), + series_id, 'Downloading series JSON')['series'][series_id] + + title = clean_html(series['name']) + description = clean_html(series.get('description') or series.get('description_short')) + + entries = [] + for page_num in itertools.count(1): + episodes = self._download_json( + 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d' + % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num), + series_id, 'Downloading episodes JSON page #%d' % page_num) + for episode in episodes.get('value', []): + episode_url = episode.get('episode_url') + if not episode_url: + continue + entries.append(self.url_result( + compat_urlparse.urljoin(url, episode_url), + 'DramaFever', episode.get('guid'))) + if page_num == episodes['num_pages']: + break + + return self.playlist_result(entries, series_id, title, description) diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 7626219ba..8b98b013a 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -15,7 +15,6 @@ class 
DRBonanzaIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', - 'md5': 'fe330252ddea607635cf2eb2c99a0af3', 'info_dict': { 'id': '65517', 'ext': 'mp4', @@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor): 'upload_date': '20110120', 'duration': 3664, }, + 'params': { + 'skip_download': True, # requires rtmp + }, }, { 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', 'md5': '6dfe039417e76795fb783c52da3de11d', @@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor): 'format_id': file['Type'].replace('Video', ''), 'preference': preferencemap.get(file['Type'], -10), }) + if format['url'].startswith('rtmp'): + rtmp_url = format['url'] + format['rtmp_live'] = True # --resume does not work + if '/bonanza/' in rtmp_url: + format['play_path'] = rtmp_url.split('/bonanza/')[1] formats.append(format) elif file['Type'] == "Thumb": thumbnail = file['Location'] @@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor): description = '%s\n%s\n%s\n' % ( info['Description'], info['Actors'], info['Colophon']) - for f in formats: - f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/') - f['url'] = f['url'].replace('mp4:bonanza', 'bonanza') self._sort_formats(formats) display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 05bb22ddf..8ac8587be 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -11,19 +11,25 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' - _TEST = { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 
'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': '3sat', - 'upload_date': '20140913' - } - } + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _TESTS = [ + { + 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', + 'md5': 'be37228896d30a88f315b638900a026e', + 'info_dict': { + 'id': '45918', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'uploader': '3sat', + 'upload_date': '20140913' + } + }, + { + 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 37c5c181f..639f9182c 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor): r'([^<]+)', r'([^<]+) - \d+'], + [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) - like_count = str_to_int(self._html_search_regex( - r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>', - webpage, 'like count', fatal=False)) - dislike_count = str_to_int(self._html_search_regex( - r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>', - webpage, 'like count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r'<span class="comments_count">([\d,\.]+)</span>', - webpage, 'comment count', fatal=False)) + def extract_count(id_, name): + return str_to_int(self._html_search_regex( + r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, + webpage, '%s count' % name, fatal=False)) + + like_count = extract_count('rate_likes', 'like') + dislike_count = extract_count('rate_dislikes', 'dislike') + comment_count = 
extract_count('comments_count', 'comment') cats_str = self._search_regex( - r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False) + r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) return { diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS 
= { diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index 9c594b757..999fb5620 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -26,7 +26,7 @@ class DumpertIE(InfoExtractor): video_id = self._match_id(url) req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'nsfw=1') + req.add_header('Cookie', 'nsfw=1; cpc=10') webpage = self._download_webpage(req, video_id) files_base64 = self._search_regex( diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 9cb1bf301..b1cd4f5d4 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -1,9 +1,7 @@ from __future__ import unicode_literals -from ..compat import ( - compat_urllib_parse, -) from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote class EHowIE(InfoExtractor): @@ -26,7 +24,7 @@ class EHowIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL') - final_url = compat_urllib_parse.unquote(video_url) + final_url = compat_urllib_parse_unquote(video_url) uploader = self._html_search_meta('uploader', webpage) title = self._og_search_title(webpage).replace(' | eHow', '') diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 5154bbd7f..02c6a4615 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -6,57 +6,42 @@ import json from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_iso8601, ) class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' - _TESTS = [{ + _TEST = { 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', 'md5': '8e3c576bf2e9bfff4d76565f56f94c9c', 'info_dict': { - 'id': '0-ipq1gsai', + 'id': '0_ipq1gsai', 'ext': 'mp4', 'title': 'Fast Fingers of Fate', - 'description': 'md5:686114ced0a032926935e9015ee794ac', - 
'timestamp': 1428033600, + 'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6', + 'timestamp': 1428035648, 'upload_date': '20150403', + 'uploader_id': 'batchUser', } - }, { - 'url': 'http://ellentube.com/videos/0-dvzmabd5/', - 'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb', - 'info_dict': { - 'id': '0-dvzmabd5', - 'ext': 'mp4', - 'title': '1 year old twin sister makes her brother laugh', - 'description': '1 year old twin sister makes her brother laugh', - 'timestamp': 1419542075, - 'upload_date': '20141225', - } - }] + } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://widgets.ellentube.com/videos/%s' % video_id, + video_id) - video_url = self._html_search_meta('VideoURL', webpage, 'url', fatal=True) - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'pageName\s*=\s*"([^"]+)"', webpage, 'title') - description = self._html_search_meta( - 'description', webpage, 'description') or self._og_search_description(webpage) - timestamp = parse_iso8601(self._search_regex( - r'<span class="publish-date"><time datetime="([^"]+)">', - webpage, 'timestamp', fatal=False)) + partner_id = self._search_regex( + r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id') - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': description, - 'timestamp': timestamp, - } + kaltura_id = self._search_regex( + [r'id="kaltura_player_([^"]+)"', + r"_wb_entry_id\s*:\s*'([^']+)", + r'data-kaltura-entry-id="([^"]+)'], + webpage, 'kaltura id') + + return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura') class EllenTVClipsIE(InfoExtractor): @@ -68,7 +53,7 @@ class EllenTVClipsIE(InfoExtractor): 'id': 'meryl-streep-vanessa-hudgens', 'title': 'Meryl Streep, Vanessa Hudgens', }, - 'playlist_mincount': 9, + 'playlist_mincount': 7, } def _real_extract(self, url): @@ -92,4 +77,8 @@ class EllenTVClipsIE(InfoExtractor): raise 
ExtractorError('Failed to download JSON', cause=ve) def _extract_entries(self, playlist): - return [self.url_result(item['url'], 'EllenTV') for item in playlist] + return [ + self.url_result( + 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), + 'Kaltura') + for item in playlist] diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py deleted file mode 100644 index 70f8efe27..000000000 --- a/youtube_dl/extractor/empflix.py +++ /dev/null @@ -1,25 +0,0 @@ -from __future__ import unicode_literals - -from .tnaflix import TNAFlixIE - - -class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html' - - _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' - _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' - _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - - _TEST = { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, - } - } diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 0cbca90b0..316033cf1 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -4,7 +4,10 @@ import re from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + unescapeHTML +) class EroProfileIE(InfoExtractor): @@ -75,8 +78,8 @@ class EroProfileIE(InfoExtractor): [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) - video_url = self._search_regex( - r'<source src="([^"]+)', webpage, 'video url') + video_url = unescapeHTML(self._search_regex( + r'<source 
src="([^"]+)', webpage, 'video url')) title = self._html_search_regex( r'Title:</th><td>([^<]+)</td>', webpage, 'title') thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index e47f3e27a..c85b4c458 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -1,128 +1,107 @@ from __future__ import unicode_literals +import json + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) +from ..compat import compat_urllib_request + from ..utils import ( - ExtractorError, - js_to_json, - parse_duration, + determine_ext, + clean_html, + int_or_none, + float_or_none, ) +def _decrypt_config(key, string): + a = '' + i = '' + r = '' + + while len(a) < (len(string) / 2): + a += key + + a = a[0:int(len(string) / 2)] + + t = 0 + while t < len(string): + i += chr(int(string[t] + string[t + 1], 16)) + t += 2 + + icko = [s for s in i] + + for t, c in enumerate(a): + r += chr(ord(c) ^ ord(icko[t])) + + return r + + class EscapistIE(InfoExtractor): - _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])' - _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko' - _TEST = { + _VALID_URL = r'https?://?(?:www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])' + _TESTS = [{ 'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', 'md5': 'ab3a706c681efca53f0a35f1415cf0d1', 'info_dict': { 'id': '6618', 'ext': 'mp4', 'description': "Baldur's Gate: Original, Modded or Enhanced Edition? 
I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", - 'uploader_id': 'the-escapist-presents', - 'uploader': 'The Escapist Presents', 'title': "Breaking Down Baldur's Gate", 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 264, + 'uploader': 'The Escapist', + } + }, { + 'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer', + 'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf', + 'info_dict': { + 'id': '10044', + 'ext': 'mp4', + 'description': 'This week, Zero Punctuation reviews Evolve.', + 'title': 'Evolve - One vs Multiplayer', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 304, + 'uploader': 'The Escapist', } - } + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage_req = compat_urllib_request.Request(url) - webpage_req.add_header('User-Agent', self._USER_AGENT) - webpage = self._download_webpage(webpage_req, video_id) - - uploader_id = self._html_search_regex( - r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'", - webpage, 'uploader ID', fatal=False) - uploader = self._html_search_regex( - r"<h1\s+class='headline'>(.*?)</a>", - webpage, 'uploader', fatal=False) - description = self._html_search_meta('description', webpage) - duration = parse_duration(self._html_search_meta('duration', webpage)) - - raw_title = self._html_search_meta('title', webpage, fatal=True) - title = raw_title.partition(' : ')[2] - - config_url = compat_urllib_parse.unquote(self._html_search_regex( - r'''(?x) - (?: - <param\s+name="flashvars".*?\s+value="config=| - flashvars="config= - ) - (https?://[^"&]+) - ''', - webpage, 'config URL')) - - formats = [] - ad_formats = [] - - def _add_format(name, cfg_url, quality): - cfg_req = compat_urllib_request.Request(cfg_url) - cfg_req.add_header('User-Agent', self._USER_AGENT) - config = self._download_json( - cfg_req, video_id, - 'Downloading ' + name + ' configuration', - 'Unable to download ' + name + ' configuration', - 
transform_source=js_to_json) - - playlist = config['playlist'] - for p in playlist: - if p.get('eventCategory') == 'Video': - ar = formats - elif p.get('eventCategory') == 'Video Postroll': - ar = ad_formats - else: - continue - - ar.append({ - 'url': p['url'], - 'format_id': name, - 'quality': quality, - 'http_headers': { - 'User-Agent': self._USER_AGENT, - }, - }) - - _add_format('normal', config_url, quality=0) - hq_url = (config_url + - ('&hq=1' if '?' in config_url else config_url + '?hq=1')) - try: - _add_format('hq', hq_url, quality=1) - except ExtractorError: - pass # That's fine, we'll just use normal quality + webpage = self._download_webpage(url, video_id) + + ims_video = self._parse_json( + self._search_regex( + r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'), + video_id) + video_id = ims_video['videoID'] + key = ims_video['hash'] + + config_req = compat_urllib_request.Request( + 'http://www.escapistmagazine.com/videos/' + 'vidconfig.php?videoID=%s&hash=%s' % (video_id, key)) + config_req.add_header('Referer', url) + config = self._download_webpage(config_req, video_id, 'Downloading video config') + + data = json.loads(_decrypt_config(key, config)) + + video_data = data['videoData'] + + title = clean_html(video_data['title']) + duration = float_or_none(video_data.get('duration'), 1000) + uploader = video_data.get('publisher') + + formats = [{ + 'url': video['src'], + 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), + 'height': int_or_none(video.get('res')), + } for video in data['files']['videos']] self._sort_formats(formats) - if '/escapist/sales-marketing/' in formats[-1]['url']: - raise ExtractorError('This IP address has been blocked by The Escapist', expected=True) - - res = { + return { 'id': video_id, 'formats': formats, - 'uploader': uploader, - 'uploader_id': uploader_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': description, + 'description': self._og_search_description(webpage), 
'duration': duration, + 'uploader': uploader, } - - if self._downloader.params.get('include_ads') and ad_formats: - self._sort_formats(ad_formats) - ad_res = { - 'id': '%s-ad' % video_id, - 'title': '%s (Postroll)' % title, - 'formats': ad_formats, - } - return { - '_type': 'playlist', - 'entries': [res, ad_res], - 'title': title, - 'id': video_id, - } - - return res diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py new file mode 100644 index 000000000..e6f8f0337 --- /dev/null +++ b/youtube_dl/extractor/espn.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ESPNIE(InfoExtractor): + _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _WORKING = False + _TESTS = [{ + 'url': 'http://espn.go.com/video/clip?id=10365079', + 'info_dict': { + 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', + 'ext': 'mp4', + 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', + 'description': '', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/nba/recap?gameId=400793786', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'class="video-play-button"[^>]+data-id="(\d+)', + webpage, 'video id') + + player = self._download_webpage( + 
'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id) + + pcode = self._search_regex( + r'["\']pcode=([^"\']+)["\']', player, 'pcode') + + return self.url_result( + 'ooyalaexternal:espn:%s:%s' % (video_id, pcode), + 'OoyalaExternal') diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f0e575320..e17bb9aea 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -9,7 +9,7 @@ from ..compat import ( compat_http_client, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -24,8 +24,12 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:\w+\.)?facebook\.com/ (?:[^#]*?\#!/)? - (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?) - (?:v|video_id)=(?P<id>[0-9]+) + (?: + (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?) + (?:v|video_id)=| + [^/]+/videos/(?:[^/]+/)? + ) + (?P<id>[0-9]+) (?:.*)''' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' @@ -46,10 +50,19 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', - } + }, + 'expected_warnings': [ + 'title' + ] }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', + 'only_matching': True, }] def _login(self): @@ -123,7 +136,7 @@ class FacebookIE(InfoExtractor): else: raise ExtractorError('Cannot parse data') data = dict(json.loads(m.group(1))) - params_raw = compat_urllib_parse.unquote(data['params']) + params_raw = 
compat_urllib_parse_unquote(data['params']) params = json.loads(params_raw) video_data = params['video_data'][0] @@ -139,12 +152,12 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', - fatal=False) + r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', + default=None) if not video_title: video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', - webpage, 'alternative title', default=None) + webpage, 'alternative title', fatal=False) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 3c39ca451..cebdd0193 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -6,9 +6,9 @@ from .common import InfoExtractor class FazIE(InfoExtractor): IE_NAME = 'faz.net' - _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', 'info_dict': { 'id': '12610585', @@ -16,7 +16,22 @@ class FazIE(InfoExtractor): 'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', 'description': 'md5:1453fbf9a0d041d985a47306192ea253', }, - } + }, { + 'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/-13659345.html', + 'only_matching': True, + }, { + 'url': 
'http://www.faz.net/aktuell/politik/-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/foobarblafasel-13659345.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) - - -class FiredriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ - '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)' - _FILE_DELETED_REGEX = r'<div class="removed_file_image">' - - _TESTS = [{ - 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', - 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', - 'info_dict': { - 'id': 'FEB892FA160EBD01', - 'ext': 'flv', - 'title': 'bbb_theora_486kbit.flv', - 'thumbnail': 're:^http://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://firedrive.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - fields = dict(re.findall(r'''(?x)<input\s+ - type="hidden"\s+ - name="([^"]+)"\s+ - value="([^"]*)" - ''', webpage)) - - post = compat_urllib_parse.urlencode(fields) - req = compat_urllib_request.Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - # Apparently, this header is required for confirmation to work. 
- req.add_header('Host', 'www.firedrive.com') - - webpage = self._download_webpage(req, video_id, - 'Downloading video page') - - title = self._search_regex(r'class="external_title_left">(.+)</div>', - webpage, 'title') - thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, - 'thumbnail', fatal=False) - if thumbnail is not None: - thumbnail = 'http:' + thumbnail - - ext = self._search_regex(r'type:\s?\'([^\']+)\',', - webpage, 'extension', fatal=False) - video_url = self._search_regex( - r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': ext, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py new file mode 100644 index 000000000..13fbc4da2 --- /dev/null +++ b/youtube_dl/extractor/fivetv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class FiveTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + http:// + (?:www\.)?5-tv\.ru/ + (?: + (?:[^/]+/)+(?P<id>\d+)| + (?P<path>[^/?#]+)(?:[/?#])? 
+ ) + ''' + + _TESTS = [{ + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://5-tv.ru/video/1021729/', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', + 'info_dict': { + 'id': 'glavnoe', + 'ext': 'mp4', + 'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/films/1507502/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/programs/broadcast/508713/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/angel/', + 'only_matching': True, + }, { + 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', + webpage, 'video url') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<title>([^<]+)', webpage, 'title') + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + } diff --git 
a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0c858b654..2fe76d661 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_request from ..utils import ( ExtractorError, - unescapeHTML, + find_xpath_attr, ) @@ -29,25 +30,31 @@ class FlickrIE(InfoExtractor): video_id = mobj.group('id') video_uploader_id = mobj.group('uploader_id') webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - webpage = self._download_webpage(webpage_url, video_id) + req = compat_urllib_request.Request(webpage_url) + req.add_header( + 'User-Agent', + # it needs a more recent version + 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') + webpage = self._download_webpage(req, video_id) - secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, 'secret') + secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') + first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') - node_id = self._html_search_regex(r'(\d+-\d+)', - first_xml, 'node_id') + node_id = find_xpath_attr( + first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', + 'id').text second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') + second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') self.report_extraction(video_id) - mobj = re.search(r'[^/]+)' + + _TEST = { 
+ 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'info_dict': { + 'id': 'gA0bHB3Ladz3', + 'ext': 'flv', + 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', + 'description': 'Courtney Lee talks about Memphis being focused.', + }, + 'add_ie': ['ThePlatform'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + config = self._parse_json( + self._search_regex( + r"data-player-config='([^']+)'", webpage, 'data player config'), + video_id) + + return self.url_result(smuggle_url( + config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True})) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index edf555b29..75723c00d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -6,18 +6,15 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( clean_html, ExtractorError, int_or_none, - float_or_none, parse_duration, determine_ext, ) +from .dailymotion import DailymotionCloudIE class FranceTVBaseInfoExtractor(InfoExtractor): @@ -58,12 +55,12 @@ class FranceTVBaseInfoExtractor(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/3963 # m3u8 urls work fine continue - video_url_parsed = compat_urllib_parse_urlparse(video_url) f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path, + 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url, video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: - formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) + formats.extend(self._extract_f4m_formats( + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', 
m3u8_id=format_id)) elif video_url.startswith('rtmp'): @@ -86,7 +83,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): 'title': info['titre'], 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, } @@ -131,12 +128,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'skip_download': 'HLS (reqires ffmpeg)' }, 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', + }, { + 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': '556e03339473995ee145930c', + 'ext': 'mp4', + 'title': 'Les entreprises familiales : le secret de la réussite', + 'thumbnail': 're:^https?://.*\.jpe?g$', + } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) + + dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) + if dmcloud_url: + return self.url_result(dmcloud_url, 'DailymotionCloud') + video_id, catalogue = self._search_regex( r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) @@ -145,11 +156,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): class FranceTVIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetv' IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ - (?: - emissions/.*?/(videos|emissions)/(?P[^/?]+) - | (emissions?|jt)/(?P[^/?]+) - )''' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?france[2345o]\.fr/ + (?: + emissions/[^/]+/(?:videos|diffusions)| + emission/[^/]+| 
+ videos| + jt + ) + /| + embed\.francetv\.fr/\?ue= + ) + (?P[^/?]+) + ''' _TESTS = [ # france2 @@ -206,24 +227,46 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, # franceo { - 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013', - 'md5': '52f0bfe202848b15915a2f39aaa8981b', + 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', + 'md5': '47d5816d3b24351cdce512ad7ab31da8', 'info_dict': { - 'id': '108634970', + 'id': '125377621', 'ext': 'flv', - 'title': 'Infô Afrique', - 'description': 'md5:ebf346da789428841bee0fd2a935ea55', - 'upload_date': '20140915', - 'timestamp': 1410822000, + 'title': 'Infô soir', + 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', + 'upload_date': '20150718', + 'timestamp': 1437241200, + 'duration': 414, + }, + }, + { + # francetv embed + 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'flv', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', + 'upload_date': '20150226', + 'timestamp': 1424989860, + 'duration': 5400, }, }, + { + 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', + 'only_matching': True, + }, + { + 'url': 'http://www.franceo.fr/videos/125377617', + 'only_matching': True, + } ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id')) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) video_id, catalogue = self._html_search_regex( - r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 47373e215..b3f1bafcc 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -5,7 +5,7 @@ 
import json from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( @@ -14,8 +14,8 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P\d+)/?' - _TEST = { + _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P\d+)/?' + _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', 'info_dict': { @@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor): 'ext': 'mp4', 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', - } - } + }, + }, { + 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'info_dict': { + 'id': 'gs-2300-6424837', + 'ext': 'flv', + 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing', + 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', + }, + }] def _real_extract(self, url): page_id = self._match_id(url) @@ -32,30 +40,42 @@ class GameSpotIE(InfoExtractor): data_video_json = self._search_regex( r'data-video=["\'](.*?)["\']', webpage, 'data video') data_video = json.loads(unescapeHTML(data_video_json)) + streams = data_video['videoStreams'] - # Transform the manifest url to a link to the mp4 files - # they are used in mobile devices. 
- f4m_url = data_video['videoStreams']['f4m_stream'] - f4m_path = compat_urlparse.urlparse(f4m_url).path - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') - http_path = f4m_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%s', http_path) - http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin( - 'http://video.gamespotcdn.com/', http_template) formats = [] - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': q, - }) + f4m_url = streams.get('f4m_stream') + if f4m_url is not None: + # Transform the manifest url to a link to the mp4 files + # they are used in mobile devices. + f4m_path = compat_urlparse.urlparse(f4m_url).path + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') + http_path = f4m_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/manifest.f4m', '') + http_template = compat_urlparse.urljoin( + 'http://video.gamespotcdn.com/', http_template) + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': q, + }) + else: + for quality in ['sd', 'hd']: + # It's actually a link to a flv file + flv_url = streams.get('f4m_{0}'.format(quality)) + if flv_url is not None: + formats.append({ + 'url': flv_url, + 'ext': 'flv', + 'format_id': quality, + }) return { 'id': data_video['guid'], 'display_id': page_id, - 'title': compat_urllib_parse.unquote(data_video['title']), + 'title': compat_urllib_parse_unquote(data_video['title']), 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 51796f3a4..43f916412 100644 --- 
a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -11,7 +11,7 @@ from ..utils import remove_end class GDCVaultIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P\d+)/(?P(\w|-)+)' + _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P\d+)/(?P(\w|-)+)?' _NETRC_MACHINE = 'gdcvault' _TESTS = [ { @@ -19,6 +19,7 @@ class GDCVaultIE(InfoExtractor): 'md5': '7ce8388f544c88b7ac11c7ab1b593704', 'info_dict': { 'id': '1019721', + 'display_id': 'Doki-Doki-Universe-Sweet-Simple', 'ext': 'mp4', 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' } @@ -27,6 +28,7 @@ class GDCVaultIE(InfoExtractor): 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', 'info_dict': { 'id': '1015683', + 'display_id': 'Embracing-the-Dark-Art-of', 'ext': 'flv', 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' }, @@ -39,10 +41,15 @@ class GDCVaultIE(InfoExtractor): 'md5': 'a5eb77996ef82118afbbe8e48731b98e', 'info_dict': { 'id': '1015301', + 'display_id': 'Thexder-Meets-Windows-95-or', 'ext': 'flv', 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', }, 'skip': 'Requires login', + }, + { + 'url': 'http://gdcvault.com/play/1020791/', + 'only_matching': True, } ] @@ -90,7 +97,7 @@ class GDCVaultIE(InfoExtractor): }) return video_formats - def _login(self, webpage_url, video_id): + def _login(self, webpage_url, display_id): (username, password) = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. 
Try specifying a username and password and try again.') @@ -107,9 +114,9 @@ class GDCVaultIE(InfoExtractor): request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(request, video_id, 'Logging in') - start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page') - self._download_webpage(logout_url, video_id, 'Logging out') + self._download_webpage(request, display_id, 'Logging in') + start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') + self._download_webpage(logout_url, display_id, 'Logging out') return start_page @@ -117,8 +124,10 @@ class GDCVaultIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('name') or video_id + webpage_url = 'http://www.gdcvault.com/play/' + video_id - start_page = self._download_webpage(webpage_url, video_id) + start_page = self._download_webpage(webpage_url, display_id) direct_url = self._search_regex( r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', @@ -131,6 +140,7 @@ class GDCVaultIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'url': video_url, 'ext': 'flv', 'title': title, @@ -141,7 +151,7 @@ class GDCVaultIE(InfoExtractor): start_page, 'xml root', default=None) if xml_root is None: # Probably need to authenticate - login_res = self._login(webpage_url, video_id) + login_res = self._login(webpage_url, display_id) if login_res is None: self.report_warning('Could not login.') else: @@ -159,7 +169,7 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex(r'