Merge branch 'lecture2go' of https://github.com/nichdu/youtube-dl into nichdu-lecture2go
author Yen Chi Hsuan <yan12125@gmail.com>
Wed, 22 Jul 2015 12:10:45 +0000 (20:10 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Wed, 22 Jul 2015 12:10:45 +0000 (20:10 +0800)
282 files changed:
AUTHORS
README.md
docs/supportedsites.md
test/helper.py
test/parameters.json
test/test_YoutubeDL.py
test/test_aes.py
test/test_compat.py
test/test_download.py
test/test_subtitles.py
test/test_utils.py
tox.ini
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/aes.py
youtube_dl/compat.py
youtube_dl/downloader/__init__.py
youtube_dl/downloader/common.py
youtube_dl/downloader/dash.py [new file with mode: 0644]
youtube_dl/downloader/external.py
youtube_dl/downloader/f4m.py
youtube_dl/downloader/http.py
youtube_dl/downloader/mplayer.py [deleted file]
youtube_dl/downloader/rtmp.py
youtube_dl/downloader/rtsp.py [new file with mode: 0644]
youtube_dl/extractor/__init__.py
youtube_dl/extractor/adobetv.py
youtube_dl/extractor/aftenposten.py
youtube_dl/extractor/aftonbladet.py
youtube_dl/extractor/appleconnect.py [new file with mode: 0644]
youtube_dl/extractor/archiveorg.py
youtube_dl/extractor/ard.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/baidu.py [new file with mode: 0644]
youtube_dl/extractor/bambuser.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/bbccouk.py
youtube_dl/extractor/bet.py
youtube_dl/extractor/bild.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/bliptv.py
youtube_dl/extractor/br.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/byutv.py
youtube_dl/extractor/canalplus.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/ccc.py
youtube_dl/extractor/ceskatelevize.py
youtube_dl/extractor/chilloutzone.py
youtube_dl/extractor/cinemassacre.py [new file with mode: 0644]
youtube_dl/extractor/clipsyndicate.py
youtube_dl/extractor/cnet.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/common.py
youtube_dl/extractor/cracked.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/ctsnews.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/dfb.py
youtube_dl/extractor/discovery.py
youtube_dl/extractor/dotsub.py
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dramafever.py [new file with mode: 0644]
youtube_dl/extractor/drbonanza.py
youtube_dl/extractor/dreisat.py
youtube_dl/extractor/drtuber.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/dumpert.py
youtube_dl/extractor/ehow.py
youtube_dl/extractor/ellentv.py
youtube_dl/extractor/empflix.py [deleted file]
youtube_dl/extractor/eroprofile.py
youtube_dl/extractor/escapist.py
youtube_dl/extractor/espn.py [new file with mode: 0644]
youtube_dl/extractor/facebook.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/firedrive.py [deleted file]
youtube_dl/extractor/fivetv.py [new file with mode: 0644]
youtube_dl/extractor/flickr.py
youtube_dl/extractor/foxsports.py [new file with mode: 0644]
youtube_dl/extractor/francetv.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/gdcvault.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/gfycat.py [new file with mode: 0644]
youtube_dl/extractor/giga.py
youtube_dl/extractor/gorillavid.py
youtube_dl/extractor/grooveshark.py [deleted file]
youtube_dl/extractor/hentaistigma.py
youtube_dl/extractor/historicfilms.py
youtube_dl/extractor/hitbox.py
youtube_dl/extractor/hostingbulk.py
youtube_dl/extractor/howcast.py
youtube_dl/extractor/howstuffworks.py
youtube_dl/extractor/iconosquare.py
youtube_dl/extractor/ign.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/imgur.py
youtube_dl/extractor/ina.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iprima.py
youtube_dl/extractor/iqiyi.py [new file with mode: 0644]
youtube_dl/extractor/izlesene.py
youtube_dl/extractor/jeuxvideo.py
youtube_dl/extractor/kanalplay.py
youtube_dl/extractor/karaoketv.py
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/kickstarter.py
youtube_dl/extractor/kuwo.py [new file with mode: 0644]
youtube_dl/extractor/letv.py
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/livestream.py
youtube_dl/extractor/lynda.py
youtube_dl/extractor/malemotion.py
youtube_dl/extractor/megavideoz.py [new file with mode: 0644]
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/miomio.py
youtube_dl/extractor/mitele.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mlb.py
youtube_dl/extractor/mofosex.py
youtube_dl/extractor/moniker.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/myspass.py
youtube_dl/extractor/myvi.py [new file with mode: 0644]
youtube_dl/extractor/myvideo.py
youtube_dl/extractor/nationalgeographic.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/neteasemusic.py [new file with mode: 0644]
youtube_dl/extractor/netzkino.py
youtube_dl/extractor/newstube.py
youtube_dl/extractor/nextmedia.py
youtube_dl/extractor/nfl.py
youtube_dl/extractor/nhl.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/noco.py
youtube_dl/extractor/nova.py [new file with mode: 0644]
youtube_dl/extractor/nowtv.py [new file with mode: 0644]
youtube_dl/extractor/npo.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/nytimes.py
youtube_dl/extractor/odnoklassniki.py
youtube_dl/extractor/onionstudios.py [new file with mode: 0644]
youtube_dl/extractor/ooyala.py
youtube_dl/extractor/openfilm.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/patreon.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/philharmoniedeparis.py [new file with mode: 0644]
youtube_dl/extractor/photobucket.py
youtube_dl/extractor/pinkbike.py [new file with mode: 0644]
youtube_dl/extractor/pladform.py
youtube_dl/extractor/planetaplay.py
youtube_dl/extractor/played.py
youtube_dl/extractor/playvid.py
youtube_dl/extractor/porn91.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/pornovoisines.py
youtube_dl/extractor/primesharetv.py
youtube_dl/extractor/promptfile.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/qqmusic.py [new file with mode: 0644]
youtube_dl/extractor/quickvid.py
youtube_dl/extractor/rds.py [new file with mode: 0644]
youtube_dl/extractor/rtbf.py
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rtlnow.py [deleted file]
youtube_dl/extractor/rts.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/ruutu.py [new file with mode: 0644]
youtube_dl/extractor/safari.py
youtube_dl/extractor/sbs.py
youtube_dl/extractor/screenwavemedia.py
youtube_dl/extractor/senateisvp.py [new file with mode: 0644]
youtube_dl/extractor/shared.py
youtube_dl/extractor/smotri.py
youtube_dl/extractor/snagfilms.py [new file with mode: 0644]
youtube_dl/extractor/sockshare.py [deleted file]
youtube_dl/extractor/sohu.py
youtube_dl/extractor/soompi.py [new file with mode: 0644]
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/southpark.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/spiegeltv.py
youtube_dl/extractor/spike.py
youtube_dl/extractor/sportbox.py
youtube_dl/extractor/srf.py [new file with mode: 0644]
youtube_dl/extractor/sunporno.py
youtube_dl/extractor/svt.py [new file with mode: 0644]
youtube_dl/extractor/svtplay.py [deleted file]
youtube_dl/extractor/tagesschau.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/ted.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/tenplay.py
youtube_dl/extractor/testtube.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/thesixtyone.py
youtube_dl/extractor/thisamericanlife.py [new file with mode: 0644]
youtube_dl/extractor/tlc.py
youtube_dl/extractor/tmz.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tubitv.py [new file with mode: 0644]
youtube_dl/extractor/tumblr.py
youtube_dl/extractor/turbo.py
youtube_dl/extractor/tutv.py
youtube_dl/extractor/tv2.py [new file with mode: 0644]
youtube_dl/extractor/tvc.py [new file with mode: 0644]
youtube_dl/extractor/tvigle.py
youtube_dl/extractor/tvplay.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/twitter.py [new file with mode: 0644]
youtube_dl/extractor/udemy.py
youtube_dl/extractor/udn.py
youtube_dl/extractor/ultimedia.py
youtube_dl/extractor/ustream.py
youtube_dl/extractor/vbox7.py
youtube_dl/extractor/veehd.py
youtube_dl/extractor/vessel.py
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/videomega.py
youtube_dl/extractor/videott.py
youtube_dl/extractor/vidme.py
youtube_dl/extractor/vier.py
youtube_dl/extractor/viewster.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vimple.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vodlocker.py
youtube_dl/extractor/voicerepublic.py [new file with mode: 0644]
youtube_dl/extractor/vporn.py
youtube_dl/extractor/vube.py
youtube_dl/extractor/vuclip.py
youtube_dl/extractor/vulture.py
youtube_dl/extractor/wat.py
youtube_dl/extractor/webofstories.py
youtube_dl/extractor/wimp.py
youtube_dl/extractor/worldstarhiphop.py
youtube_dl/extractor/xbef.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xminus.py
youtube_dl/extractor/xnxx.py
youtube_dl/extractor/xstream.py [new file with mode: 0644]
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/yam.py
youtube_dl/extractor/yinyuetai.py [new file with mode: 0644]
youtube_dl/extractor/ynet.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/yourupload.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zingmp3.py
youtube_dl/options.py
youtube_dl/postprocessor/__init__.py
youtube_dl/postprocessor/atomicparsley.py [deleted file]
youtube_dl/postprocessor/common.py
youtube_dl/postprocessor/embedthumbnail.py [new file with mode: 0644]
youtube_dl/postprocessor/execafterdownload.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/postprocessor/metadatafromtitle.py
youtube_dl/postprocessor/xattrpp.py
youtube_dl/update.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index db3f42b265780e27075c99672aa4f4bd6c2f0784..373e05c9f4ecc849bd55f984219b47739728171c 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -123,3 +123,14 @@ Will W.
 Mohammad Teimori Pabandi
 Roman Le Négrate
 Matthias Küch
+Julian Richen
+Ping O.
+Mister Hat
+Peter Ding
+jackyzy823
+George Brighton
+Remita Amine
+Aurélio A. Heckert
+Bernhard Minks
+sceext
+Zach Bruggeman
index caa1478d9951f3a8799eb05f979dade6c96fab52..ac54d7b67b8c36d370495759153678f711ac614e 100644 (file)
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ youtube-dl - download videos from youtube.com or other video platforms
 - [OPTIONS](#options)
 - [CONFIGURATION](#configuration)
 - [OUTPUT TEMPLATE](#output-template)
+- [FORMAT SELECTION](#format-selection)
 - [VIDEO SELECTION](#video-selection)
 - [FAQ](#faq)
 - [DEVELOPER INSTRUCTIONS](#developer-instructions)
@@ -16,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
     sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
 
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
 
@@ -51,8 +52,9 @@ which means you can modify it, redistribute it or use it however you like.
     -i, --ignore-errors              Continue on download errors, for example to skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the playlist or the command line) if an error occurs
     --dump-user-agent                Display the current browser identification
-    --list-extractors                List all supported extractors and the URLs they would handle
+    --list-extractors                List all supported extractors
     --extractor-descriptions         Output descriptions of all supported extractors
+    --force-generic-extractor        Force extraction to use the generic extractor
     --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple".
                                      Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The
                                      default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.
@@ -73,7 +75,7 @@ which means you can modify it, redistribute it or use it however you like.
 ## Video Selection:
     --playlist-start NUMBER          Playlist video to start at (default is 1)
     --playlist-end NUMBER            Playlist video to end at (default is last)
-    --playlist-items ITEM_SPEC       Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8"
+    --playlist-items ITEM_SPEC       Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8"
                                      if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will
                                      download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.
     --match-title REGEX              Download only matching titles (regex or caseless sub-string)
@@ -106,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like.
     --playlist-reverse               Download playlist videos in reverse order
     --xattr-set-filesize             Set file xattribute ytdl.filesize with expected filesize (experimental)
     --hls-prefer-native              Use the native HLS downloader instead of ffmpeg (experimental)
-    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,wget
+    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,httpie,wget
     --external-downloader-args ARGS  Give these arguments to the external downloader
 
 ## Filesystem Options:
@@ -132,7 +134,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-mtime                       Do not use the Last-modified header to set the file modification time
     --write-description              Write video description to a .description file
     --write-info-json                Write video metadata to a .info.json file
-    --write-annotations              Write video annotations to a .annotation file
+    --write-annotations              Write video annotations to a .annotations.xml file
     --load-info FILE                 JSON file containing the video information (created with the "--write-info-json" option)
     --cookies FILE                   File to read cookies from and dump cookie jar in
     --cache-dir DIR                  Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl
@@ -167,7 +169,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-progress                    Do not print progress bar
     --console-title                  Display progress in console titlebar
     -v, --verbose                    Print various debugging information
-    --dump-pages                     Print downloaded pages to debug problems (very verbose)
+    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)
     --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems
     --print-traffic                  Display sent and read HTTP traffic
     -C, --call-home                  Contact the youtube-dl server for debugging
@@ -184,22 +186,12 @@ which means you can modify it, redistribute it or use it however you like.
     --sleep-interval SECONDS         Number of seconds to sleep before each download.
 
 ## Video Format Options:
-    -f, --format FORMAT              Video format code, specify the order of preference using slashes, as in -f 22/17/18 .  Instead of format codes, you can select by
-                                     extension for the extensions aac, m4a, mp3, mp4, ogg, wav, webm. You can also use the special names "best", "bestvideo", "bestaudio",
-                                     "worst".  You can filter the video results by putting a condition in brackets, as in -f "best[height=720]" (or -f "[filesize>10M]").
-                                     This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec,
-                                     vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a
-                                     question mark (?) after the operator. You can combine format filters, so  -f "[height <=? 720][tbr>500]" selects up to 720p videos
-                                     (or videos where the height is not known) with a bitrate of at least 500 KBit/s. By default, youtube-dl will pick the best quality.
-                                     Use commas to download multiple audio formats, such as -f  136/137/mp4/bestvideo,140/m4a/bestaudio. You can merge the video and audio
-                                     of two formats into a single file using -f <video-format>+<audio-format> (requires ffmpeg or avconv), for example -f
-                                     bestvideo+bestaudio.
+    -f, --format FORMAT              Video format code, see the "FORMAT SELECTION" for all the info
     --all-formats                    Download all available video formats
     --prefer-free-formats            Prefer free video formats unless a specific one is requested
-    --max-quality FORMAT             Highest quality format to download
     -F, --list-formats               List all available formats
-    --youtube-skip-dash-manifest     Do not download the DASH manifest on YouTube videos
-    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no
+    --youtube-skip-dash-manifest     Do not download the DASH manifests and related data on YouTube videos
+    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no
                                      merge is required
 
 ## Subtitle Options:
@@ -222,17 +214,18 @@ which means you can modify it, redistribute it or use it however you like.
     --audio-format FORMAT            Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default
     --audio-quality QUALITY          Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default
                                      5)
-    --recode-video FORMAT            Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)
+    --recode-video FORMAT            Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)
+    --postprocessor-args ARGS        Give these arguments to the postprocessor
     -k, --keep-video                 Keep the video file on disk after the post-processing; the video is erased by default
     --no-post-overwrites             Do not overwrite post-processed files; the post-processed files are overwritten by default
-    --embed-subs                     Embed subtitles in the video (only for mp4 videos)
+    --embed-subs                     Embed subtitles in the video (only for mkv and mp4 videos)
     --embed-thumbnail                Embed thumbnail in the audio as cover art
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed
-                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s -
+                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
-    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
+    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default;
                                      fix file if we can, warn otherwise)
     --prefer-avconv                  Prefer avconv over ffmpeg for running the postprocessors (default)
     --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the postprocessors
@@ -245,6 +238,26 @@ which means you can modify it, redistribute it or use it however you like.
 
 You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`.
 
+### Authentication with `.netrc` file ###
+
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
+```
+touch $HOME/.netrc
+chmod a-rwx,u+rw $HOME/.netrc
+```
+After that you can add credentials for extractor in the following format, where *extractor* is the name of extractor in lowercase:
+```
+machine <extractor> login <login> password <password>
+```
+For example:
+```
+machine youtube login myaccount@gmail.com password my_youtube_password
+machine twitch login my_twitch_account_name password my_twitch_password
+```
+To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration).
+
+On Windows you may also need to set up the `%HOME%` environment variable manually.
+
 # OUTPUT TEMPLATE
 
 The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are:
@@ -271,6 +284,17 @@ $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filena
 youtube-dl_test_video_.mp4          # A simple file name
 ```
 
+# FORMAT SELECTION
+
+By default youtube-dl tries to download the best quality, but sometimes you may want to download a different format.
+The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`.
+
+If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`).  This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so  `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`.
+
+Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed.
+
+If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl.
+
 # VIDEO SELECTION
 
 Videos can be filtered by their upload date using the options `--date`, `--datebefore` or `--dateafter`, they accept dates in two formats:
@@ -321,9 +345,9 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need
 
 If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update.
 
-### Do I always have to pass in `--max-quality FORMAT`, or `-citw`?
+### Do I always have to pass `-citw`?
 
-By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`.
+By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`.
 
 ### Can you please put the -b option back?
 
@@ -355,13 +379,29 @@ YouTube has switched to a new video info format in July 2011 which is not suppor
 
 YouTube requires an additional signature since September 2012 which is not supported by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl.
 
+### Video URL contains an ampersand and I'm getting some strange output `[1] 2839` or `'v' is not recognized as an internal or external command` ###
+
+That's actually the output from your shell. Since ampersand is one of the special shell characters it's interpreted by the shell, preventing you from passing the whole URL to youtube-dl. To prevent your shell from interpreting the ampersands (or any other special characters) you have to either put the whole URL in quotes or escape them with a backslash (which approach will work depends on your shell).
+
+For example if your URL is https://www.youtube.com/watch?t=4&v=BaW_jenozKc you should end up with the following command:
+
+```youtube-dl 'https://www.youtube.com/watch?t=4&v=BaW_jenozKc'```
+
+or
+
+```youtube-dl https://www.youtube.com/watch?t=4\&v=BaW_jenozKc```
+
+For Windows you have to use the double quotes:
+
+```youtube-dl "https://www.youtube.com/watch?t=4&v=BaW_jenozKc"```
+
 ### ExtractorError: Could not find JS function u'OF'
 
 In February 2015, the new YouTube player contained a character sequence in a string that was misinterpreted by old versions of youtube-dl. See [above](#how-do-i-update-youtube-dl) for how to update youtube-dl.
 
 ### HTTP Error 429: Too Many Requests or 402: Payment Required
 
-These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address.
+These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address.
 
 ### SyntaxError: Non-ASCII character ###
 
index c85a3991800e5c86d19bee8ed8fb8d1a9c6d96b9..73445137f3f165de4d98b6f1c367553f5ce9345a 100644 (file)
  - **56.com**
  - **5min**
  - **8tracks**
+ - **91porn**
  - **9gag**
  - **abc.net.au**
  - **Abc7News**
  - **AcademicEarth:Course**
  - **AddAnime**
  - **AdobeTV**
+ - **AdobeTVVideo**
  - **AdultSwim**
  - **Aftenposten**
  - **Aftonbladet**
@@ -26,8 +28,8 @@
  - **anitube.se**
  - **AnySex**
  - **Aparat**
- - **AppleDailyAnimationNews**
- - **AppleDailyRealtimeNews**
+ - **AppleConnect**
+ - **AppleDaily**: 臺灣蘋果日報
  - **AppleTrailers**
  - **archive.org**: archive.org videos
  - **ARD**
@@ -44,6 +46,7 @@
  - **audiomack**
  - **audiomack:album**
  - **Azubu**
+ - **BaiduVideo**: 百度视频
  - **bambuser**
  - **bambuser:channel**
  - **Bandcamp**
@@ -63,6 +66,8 @@
  - **BR**: Bayerischer Rundfunk Mediathek
  - **Break**
  - **Brightcove**
+ - **bt:article**: Bergens Tidende Articles
+ - **bt:vestlendingen**: Bergens Tidende - Vestlendingen
  - **BuzzFeed**
  - **BYUtv**
  - **Camdemy**
  - **CondeNast**: Condé Nast media group: Condé Nast, GQ, Glamour, Vanity Fair, Vogue, W Magazine, WIRED
  - **Cracked**
  - **Criterion**
+ - **CrooksAndLiars**
  - **Crunchyroll**
  - **crunchyroll:playlist**
  - **CSpan**: C-SPAN
- - **CtsNews**
+ - **CtsNews**: 華視新聞
  - **culturebox.francetvinfo.fr**
  - **dailymotion**
  - **dailymotion:playlist**
  - **dailymotion:user**
+ - **DailymotionCloud**
  - **daum.net**
  - **DBTV**
  - **DctpTv**
  - **Discovery**
  - **divxstage**: DivxStage
  - **Dotsub**
- - **DouyuTV**
+ - **DouyuTV**: 斗鱼
+ - **dramafever**
+ - **dramafever:series**
  - **DRBonanza**
  - **Dropbox**
  - **DrTuber**
  - **Eporner**
  - **EroProfile**
  - **Escapist**
+ - **ESPN** (Currently broken)
  - **EveryonesMixtape**
  - **exfm**: ex.fm
  - **ExpoTV**
  - **fc2**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
- - **Firedrive**
  - **Firstpost**
+ - **FiveTV**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
  - **Foxgay**
  - **FoxNews**
+ - **FoxSports**
  - **france2.fr:generation-quoi**
  - **FranceCulture**
  - **FranceInter**
  - **Gazeta**
  - **GDCVault**
  - **generic**: Generic downloader that works on some sites
+ - **Gfycat**
  - **GiantBomb**
  - **Giga**
  - **Glide**: Glide mobile video messages (glide.me)
  - **GodTube**
  - **GoldenMoustache**
  - **Golem**
- - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in and fastvideo.in
+ - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net
  - **Goshgay**
- - **Grooveshark**
  - **Groupon**
  - **Hark**
  - **HearThisAt**
  - **instagram:user**: Instagram user profile
  - **InternetVideoArchive**
  - **IPrima**
+ - **iqiyi**: 爱奇艺
  - **ivi**: ivi.ru
  - **ivi:compilation**: ivi.ru compilations
  - **Izlesene**
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
+ - **KarriereVideos**
  - **keek**
  - **KeezMovies**
  - **KhanAcademy**
  - **kontrtube**: KontrTube.ru - Труба зовёт
  - **KrasView**: Красвью
  - **Ku6**
+ - **kuwo:album**: 酷我音乐 - 专辑
+ - **kuwo:category**: 酷我音乐 - 分类
+ - **kuwo:chart**: 酷我音乐 - 排行榜
+ - **kuwo:mv**: 酷我音乐 - MV
+ - **kuwo:singer**: 酷我音乐 - 歌手
+ - **kuwo:song**: 酷我音乐
  - **la7.tv**
  - **Laola1Tv**
- - **Letv**
+ - **Letv**: 乐视网
  - **LetvPlaylist**
  - **LetvTv**
  - **Libsyn**
+ - **life:embed**
  - **lifenews**: LIFE | NEWS
  - **LiveLeak**
  - **livestream**
  - **Malemotion**
  - **MDR**
  - **media.ccc.de**
+ - **MegaVideoz**
  - **metacafe**
  - **Metacritic**
  - **Mgoon**
  - **Motherless**
  - **Motorsport**: motorsport.com
  - **MovieClips**
+ - **MovieFap**
  - **Moviezine**
  - **movshare**: MovShare
  - **MPORA**
  - **MySpace**
  - **MySpace:album**
  - **MySpass**
+ - **Myvi**
  - **myvideo**
  - **MyVidster**
+ - **N-JOY**
  - **n-tv.de**
  - **NationalGeographic**
  - **Naver**
  - **NDTV**
  - **NerdCubedFeed**
  - **Nerdist**
+ - **netease:album**: 网易云音乐 - 专辑
+ - **netease:djradio**: 网易云音乐 - 电台
+ - **netease:mv**: 网易云音乐 - MV
+ - **netease:playlist**: 网易云音乐 - 歌单
+ - **netease:program**: 网易云音乐 - 电台节目
+ - **netease:singer**: 网易云音乐 - 歌手
+ - **netease:song**: 网易云音乐
  - **Netzkino**
  - **Newgrounds**
  - **Newstube**
- - **NextMedia**
- - **NextMediaActionNews**
+ - **NextMedia**: 蘋果日報
+ - **NextMediaActionNews**: 蘋果日報 - 動新聞
  - **nfb**: National Film Board of Canada
  - **nfl.com**
  - **nhl.com**
  - **Noco**
  - **Normalboots**
  - **NosVideo**
+ - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
  - **novamov**: NovaMov
  - **Nowness**
+ - **NowTV**
  - **nowvideo**: NowVideo
- - **npo.nl**
+ - **npo**: npo.nl and ntr.nl
  - **npo.nl:live**
  - **npo.nl:radio**
  - **npo.nl:radio:fragment**
  - **NRK**
  - **NRKPlaylist**
- - **NRKTV**
+ - **NRKTV**: NRK TV and NRK Radio
  - **ntv.ru**
  - **Nuvid**
  - **NYTimes**
+ - **NYTimesArticle**
  - **ocw.mit.edu**
  - **Odnoklassniki**
  - **OktoberfestTV**
  - **on.aol.com**
+ - **OnionStudios**
  - **Ooyala**
+ - **OoyalaExternal**
  - **OpenFilm**
  - **orf:fm4**: radio FM4
  - **orf:iptv**: iptv.ORF.at
  - **parliamentlive.tv**: UK parliament videos
  - **Patreon**
  - **PBS**
+ - **PhilharmonieDeParis**: Philharmonie de Paris
  - **Phoenix**
  - **Photobucket**
+ - **Pinkbike**
  - **Pladform**
  - **PlanetaPlay**
  - **play.fm**
  - **prosiebensat1**: ProSiebenSat.1 Digital
  - **Puls4**
  - **Pyvideo**
+ - **qqmusic**: QQ音乐
+ - **qqmusic:album**: QQ音乐 - 专辑
+ - **qqmusic:playlist**: QQ音乐 - 歌单
+ - **qqmusic:singer**: QQ音乐 - 歌手
+ - **qqmusic:toplist**: QQ音乐 - 排行榜
  - **QuickVid**
  - **R7**
  - **radio.de**
  - **RadioJavan**
  - **Rai**
  - **RBMARadio**
+ - **RDS**: RDS.ca
  - **RedTube**
  - **Restudy**
  - **ReverbNation**
  - **Rte**
  - **rtl.nl**: rtl.nl and rtlxl.nl
  - **RTL2**
- - **RTLnow**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **Ruutu**
  - **safari**: safaribooksonline.com online video
  - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
  - **Screencast**
  - **ScreencastOMatic**
  - **ScreenwaveMedia**
+ - **SenateISVP**
  - **ServingSys**
  - **Sexu**
  - **SexyKarma**: Sexy Karma and Watch Indian Porn
  - **smotri:broadcast**: Smotri.com broadcasts
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
+ - **SnagFilms**
+ - **SnagFilmsEmbed**
  - **Snotr**
- - **Sockshare**
  - **Sohu**
+ - **soompi**
+ - **soompi:show**
  - **soundcloud**
  - **soundcloud:playlist**
  - **soundcloud:set**
  - **soundgasm**
  - **soundgasm:profile**
  - **southpark.cc.com**
+ - **southpark.cc.com:español**
  - **southpark.de**
+ - **southpark.nl**
+ - **southparkstudios.dk**
  - **Space**
  - **SpankBang**
  - **Spankwire**
  - **Spike**
  - **Sport5**
  - **SportBox**
+ - **SportBoxEmbed**
  - **SportDeutschland**
+ - **Sportschau**
+ - **Srf**
  - **SRMediathek**: Saarländischer Rundfunk
  - **SSA**
  - **stanfordoc**: Stanford Open ClassRoom
  - **StreamCZ**
  - **StreetVoice**
  - **SunPorno**
+ - **SVT**
  - **SVTPlay**: SVT Play and Öppet arkiv
  - **SWRMediathek**
  - **Syfy**
  - **TeamFour**
  - **TechTalks**
  - **techtv.mit.edu**
- - **TED**
- - **tegenlicht.vpro.nl**
+ - **ted**
  - **TeleBruxelles**
  - **telecinco.es**
  - **TeleMB**
  - **TheOnion**
  - **ThePlatform**
  - **TheSixtyOne**
+ - **ThisAmericanLife**
  - **ThisAV**
  - **THVideo**
  - **THVideoPlaylist**
  - **tlc.com**
  - **tlc.de**
  - **TMZ**
+ - **TMZArticle**
  - **TNAFlix**
  - **tou.tv**
  - **Toypics**: Toypics user profile
  - **Trilulilu**
  - **TruTube**
  - **Tube8**
+ - **TubiTv**
  - **Tudou**
  - **Tumblr**
  - **TuneIn**
  - **Turbo**
  - **Tutv**
  - **tv.dfb.de**
+ - **TV2**
+ - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
+ - **TVC**
+ - **TVCArticle**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
  - **tvp.pl:Series**
  - **twitch:stream**
  - **twitch:video**
  - **twitch:vod**
+ - **TwitterCard**
  - **Ubu**
  - **udemy**
  - **udemy:course**
- - **UDNEmbed**
+ - **UDNEmbed**: 聯合影音
  - **Ultimedia**
  - **Unistra**
  - **Urort**: NRK P3 Urørt
  - **Vessel**
  - **Vesti**: Вести.Ru
  - **Vevo**
- - **VGTV**
+ - **VGTV**: VGTV and BTTV
  - **vh1.com**
  - **Vice**
  - **Viddler**
  - **vier:videos**
  - **Viewster**
  - **viki**
+ - **viki:channel**
  - **vimeo**
  - **vimeo:album**
  - **vimeo:channel**
  - **vimeo:review**: Review pages on vimeo
  - **vimeo:user**
  - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)
- - **Vimple**: Vimple.ru
+ - **Vimple**: Vimple - one-click video hosting
  - **Vine**
  - **vine:user**
- - **vk.com**
- - **vk.com:user-videos**: vk.com:All of a user's videos
+ - **vk**: VK
+ - **vk:uservideos**: VK - User's Videos
  - **Vodlocker**
+ - **VoiceRepublic**
  - **Vporn**
  - **VRT**
  - **vube**: Vube.com
  - **wdr:mobile**
  - **WDRMaus**: Sendung mit der Maus
  - **WebOfStories**
+ - **WebOfStoriesPlaylist**
  - **Weibo**
  - **Wimp**
  - **Wistia**
+ - **WNL**
  - **WorldStarHipHop**
  - **wrzuta.pl**
  - **WSJ**: Wall Street Journal
  - **XBef**
  - **XboxClips**
  - **XHamster**
+ - **XHamsterEmbed**
  - **XMinus**
  - **XNXX**
+ - **Xstream**
  - **XTube**
  - **XTubeUser**: XTube user profile
- - **Xuite**
+ - **Xuite**: 隨意窩Xuite影音
  - **XVideos**
  - **XXXYMovies**
  - **Yahoo**: Yahoo screen and movies
- - **Yam**
+ - **Yam**: 蕃薯藤yam天空部落
  - **yandexmusic:album**: Яндекс.Музыка - Альбом
  - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
  - **yandexmusic:track**: Яндекс.Музыка - Трек
  - **YesJapan**
+ - **yinyuetai:video**: 音悦Tai
  - **Ynet**
  - **YouJizz**
- - **Youku**
+ - **youku**: 优酷
  - **YouPorn**
  - **YourUpload**
  - **youtube**: YouTube.com
index 12afdf184f0215e9947515cd3a8516ccad2e480e..e1129e58f44c9f5118b16a52dacdd869d3dd0123 100644 (file)
@@ -150,7 +150,7 @@ def expect_info_dict(self, got_dict, expected_dict):
                              'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
 
     # Check for the presence of mandatory fields
-    if got_dict.get('_type') != 'playlist':
+    if got_dict.get('_type') not in ('playlist', 'multi_video'):
         for key in ('id', 'url', 'title', 'ext'):
             self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
     # Check for mandatory fields that are automatically set by YoutubeDL
index cbff9bd16486fcda2c155e6978c354e320cfc95b..7bf59c25fdf77dd4c736f21b9ede9304e9bb214c 100644 (file)
@@ -7,8 +7,7 @@
     "forcethumbnail": false, 
     "forcetitle": false, 
     "forceurl": false, 
-    "format": null, 
-    "format_limit": null, 
+    "format": "best",
     "ignoreerrors": false, 
     "listformats": null, 
     "logtostderr": false, 
index 652519831cec63a518ceb87d409603f13d9383a8..a13c09ef40c7d8c101c86b471cae142ada9ccd7c 100644 (file)
@@ -12,6 +12,7 @@ import copy
 
 from test.helper import FakeYDL, assertRegexpMatches
 from youtube_dl import YoutubeDL
+from youtube_dl.compat import compat_str
 from youtube_dl.extractor import YoutubeIE
 from youtube_dl.postprocessor.common import PostProcessor
 from youtube_dl.utils import match_filter_func
@@ -101,39 +102,6 @@ class TestFormatSelection(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['ext'], 'flv')
 
-    def test_format_limit(self):
-        formats = [
-            {'format_id': 'meh', 'url': 'http://example.com/meh', 'preference': 1},
-            {'format_id': 'good', 'url': 'http://example.com/good', 'preference': 2},
-            {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3},
-            {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4},
-        ]
-        info_dict = _make_result(formats)
-
-        ydl = YDL()
-        ydl.process_ie_result(info_dict)
-        downloaded = ydl.downloaded_info_dicts[0]
-        self.assertEqual(downloaded['format_id'], 'excellent')
-
-        ydl = YDL({'format_limit': 'good'})
-        assert ydl.params['format_limit'] == 'good'
-        ydl.process_ie_result(info_dict.copy())
-        downloaded = ydl.downloaded_info_dicts[0]
-        self.assertEqual(downloaded['format_id'], 'good')
-
-        ydl = YDL({'format_limit': 'great', 'format': 'all'})
-        ydl.process_ie_result(info_dict.copy())
-        self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'meh')
-        self.assertEqual(ydl.downloaded_info_dicts[1]['format_id'], 'good')
-        self.assertEqual(ydl.downloaded_info_dicts[2]['format_id'], 'great')
-        self.assertTrue('3' in ydl.msgs[0])
-
-        ydl = YDL()
-        ydl.params['format_limit'] = 'excellent'
-        ydl.process_ie_result(info_dict.copy())
-        downloaded = ydl.downloaded_info_dicts[0]
-        self.assertEqual(downloaded['format_id'], 'excellent')
-
     def test_format_selection(self):
         formats = [
             {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL},
@@ -270,7 +238,7 @@ class TestFormatSelection(unittest.TestCase):
             f2['url'] = 'url:' + f2id
 
             info_dict = _make_result([f1, f2], extractor='youtube')
-            ydl = YDL()
+            ydl = YDL({'format': 'best/bestvideo'})
             yie = YoutubeIE(ydl)
             yie._sort_formats(info_dict['formats'])
             ydl.process_ie_result(info_dict)
@@ -278,7 +246,7 @@ class TestFormatSelection(unittest.TestCase):
             self.assertEqual(downloaded['format_id'], f1id)
 
             info_dict = _make_result([f2, f1], extractor='youtube')
-            ydl = YDL()
+            ydl = YDL({'format': 'best/bestvideo'})
             yie = YoutubeIE(ydl)
             yie._sort_formats(info_dict['formats'])
             ydl.process_ie_result(info_dict)
@@ -443,27 +411,36 @@ class TestYoutubeDL(unittest.TestCase):
             def run(self, info):
                 with open(audiofile, 'wt') as f:
                     f.write('EXAMPLE')
-                info['filepath']
-                return False, info
+                return [info['filepath']], info
 
-        def run_pp(params):
+        def run_pp(params, PP):
             with open(filename, 'wt') as f:
                 f.write('EXAMPLE')
             ydl = YoutubeDL(params)
-            ydl.add_post_processor(SimplePP())
+            ydl.add_post_processor(PP())
             ydl.post_process(filename, {'filepath': filename})
 
-        run_pp({'keepvideo': True})
+        run_pp({'keepvideo': True}, SimplePP)
         self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
         self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
         os.unlink(filename)
         os.unlink(audiofile)
 
-        run_pp({'keepvideo': False})
+        run_pp({'keepvideo': False}, SimplePP)
         self.assertFalse(os.path.exists(filename), '%s exists' % filename)
         self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
         os.unlink(audiofile)
 
+        class ModifierPP(PostProcessor):
+            def run(self, info):
+                with open(info['filepath'], 'wt') as f:
+                    f.write('MODIFIED')
+                return [], info
+
+        run_pp({'keepvideo': False}, ModifierPP)
+        self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
+        os.unlink(filename)
+
     def test_match_filter(self):
         class FilterYDL(YDL):
             def __init__(self, *args, **kwargs):
@@ -531,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase):
         res = get_videos(f)
         self.assertEqual(res, ['1'])
 
+    def test_playlist_items_selection(self):
+        entries = [{
+            'id': compat_str(i),
+            'title': compat_str(i),
+            'url': TEST_URL,
+        } for i in range(1, 5)]
+        playlist = {
+            '_type': 'playlist',
+            'id': 'test',
+            'entries': entries,
+            'extractor': 'test:playlist',
+            'extractor_key': 'test:playlist',
+            'webpage_url': 'http://example.com',
+        }
+
+        def get_ids(params):
+            ydl = YDL(params)
+            # make a copy because the dictionary can be modified
+            ydl.process_ie_result(playlist.copy())
+            return [int(v['id']) for v in ydl.downloaded_info_dicts]
+
+        result = get_ids({})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 10})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 2})
+        self.assertEqual(result, [1, 2])
+
+        result = get_ids({'playliststart': 10})
+        self.assertEqual(result, [])
+
+        result = get_ids({'playliststart': 2})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2-4'})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2,4'})
+        self.assertEqual(result, [2, 4])
+
+        result = get_ids({'playlist_items': '10'})
+        self.assertEqual(result, [])
+
 
 if __name__ == '__main__':
     unittest.main()
index 4dc7de7b5b8d55fc1d95296b58e076aba664e1cc..315a3f5ae6a597662d05f56e97672b4ff93aff10 100644 (file)
@@ -39,7 +39,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 16))
         self.assertEqual(decrypted, self.secret_msg)
 
@@ -47,7 +47,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 32))
         self.assertEqual(decrypted, self.secret_msg)
 
index 1eb454e068970eb1a0d48cc3cd881e0bf71f9463..c3ba8ad2e3aa1f5cd33dd5a61a184d52cc0c07a9 100644 (file)
@@ -14,6 +14,8 @@ from youtube_dl.utils import get_filesystem_encoding
 from youtube_dl.compat import (
     compat_getenv,
     compat_expanduser,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
 )
 
 
@@ -42,5 +44,28 @@ class TestCompat(unittest.TestCase):
             dir(youtube_dl.compat))) - set(['unicode_literals'])
         self.assertEqual(all_names, sorted(present_names))
 
+    def test_compat_urllib_parse_unquote(self):
+        self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def')
+        self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def')
+        self.assertEqual(compat_urllib_parse_unquote(''), '')
+        self.assertEqual(compat_urllib_parse_unquote('%'), '%')
+        self.assertEqual(compat_urllib_parse_unquote('%%'), '%%')
+        self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%')
+        self.assertEqual(compat_urllib_parse_unquote('%2F'), '/')
+        self.assertEqual(compat_urllib_parse_unquote('%2f'), '/')
+        self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波')
+        self.assertEqual(
+            compat_urllib_parse_unquote('''<meta property="og:description" content="%E2%96%81%E2%96%82%E2%96%83%E2%96%84%25%E2%96%85%E2%96%86%E2%96%87%E2%96%88" />
+%<a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%B3%D9%88%D9%86%D8%A7%D9%85%D9%8A">%a'''),
+            '''<meta property="og:description" content="▁▂▃▄%▅▆▇█" />
+%<a href="https://ar.wikipedia.org/wiki/تسونامي">%a''')
+        self.assertEqual(
+            compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%86%B6%I%Break%25Things%'''),
+            '''(^◣_◢^)っ︻デ═一    ⇀    ⇀    ⇀    ⇀    ⇀    ↶%I%Break%Things%''')
+
+    def test_compat_urllib_parse_unquote_plus(self):
+        self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
+        self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')
+
 if __name__ == '__main__':
     unittest.main()
index 6a149ae4f707e1dc048890b72a4903ccb8a5f785..1110357a7e8097eb38479d2a15837516af32a726 100644 (file)
@@ -153,7 +153,7 @@ def generator(test_case):
                     break
 
             if is_playlist:
-                self.assertEqual(res_dict['_type'], 'playlist')
+                self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
                 self.assertTrue('entries' in res_dict)
                 expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
 
index 891ee620b1f2627dd6991e0cccfbc58b59fb6a95..c4e3adb67b7d1034b36cdd3c45969fe321351c64 100644 (file)
@@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['no']))
-        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a')
+        self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')
 
 
 class TestRaiSubtitles(BaseTestSubtitles):
index 2e3a6480cb15e3c762fa68533d6dad988740a6c4..e13e11b59f7f427e8c082f003c650268895ef6f3 100644 (file)
@@ -40,7 +40,8 @@ from youtube_dl.utils import (
     read_batch_urls,
     sanitize_filename,
     sanitize_path,
-    sanitize_url_path_consecutive_slashes,
+    prepend_extension,
+    replace_extension,
     shell_quote,
     smuggle_url,
     str_to_int,
@@ -51,6 +52,7 @@ from youtube_dl.utils import (
     unified_strdate,
     unsmuggle_url,
     uppercase_escape,
+    lowercase_escape,
     url_basename,
     urlencode_postdata,
     version_tuple,
@@ -58,6 +60,8 @@ from youtube_dl.utils import (
     xpath_text,
     render_table,
     match_str,
+    parse_dfxp_time_expr,
+    dfxp2srt,
 )
 
 
@@ -171,25 +175,21 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(sanitize_path('./abc'), 'abc')
         self.assertEqual(sanitize_path('./../abc'), '..\\abc')
 
-    def test_sanitize_url_path_consecutive_slashes(self):
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname//'),
-            'http://hostname/')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/'),
-            'http://hostname/')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/abc//'),
-            'http://hostname/abc/')
+    def test_prepend_extension(self):
+        self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext')
+        self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext')
+        self.assertEqual(prepend_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp')
+        self.assertEqual(prepend_extension('abc', 'temp'), 'abc.temp')
+        self.assertEqual(prepend_extension('.abc', 'temp'), '.abc.temp')
+        self.assertEqual(prepend_extension('.abc.ext', 'temp'), '.abc.temp.ext')
+
+    def test_replace_extension(self):
+        self.assertEqual(replace_extension('abc.ext', 'temp'), 'abc.temp')
+        self.assertEqual(replace_extension('abc.ext', 'temp', 'ext'), 'abc.temp')
+        self.assertEqual(replace_extension('abc.unexpected_ext', 'temp', 'ext'), 'abc.unexpected_ext.temp')
+        self.assertEqual(replace_extension('abc', 'temp'), 'abc.temp')
+        self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp')
+        self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp')
 
     def test_ordered_set(self):
         self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
@@ -398,6 +398,10 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(uppercase_escape('aä'), 'aä')
         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
 
+    def test_lowercase_escape(self):
+        self.assertEqual(lowercase_escape('aä'), 'aä')
+        self.assertEqual(lowercase_escape('\\u0026'), '&')
+
     def test_limit_length(self):
         self.assertEqual(limit_length(None, 12), None)
         self.assertEqual(limit_length('foo', 12), 'foo')
@@ -581,6 +585,57 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
             'like_count > 100 & dislike_count <? 50 & description',
             {'like_count': 190, 'dislike_count': 10}))
 
+    def test_parse_dfxp_time_expr(self):
+        self.assertEqual(parse_dfxp_time_expr(None), 0.0)
+        self.assertEqual(parse_dfxp_time_expr(''), 0.0)
+        self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
+
+    def test_dfxp2srt(self):
+        dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
+                    <p begin="1" end="2">第二行<br/>♪♪</p>
+                    <p begin="2" dur="1"><span>Third<br/>Line</span></p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The following line contains Chinese characters and special symbols
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+♪♪
+
+3
+00:00:02,000 --> 00:00:03,000
+Third
+Line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+
+        dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The first line</p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The first line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tox.ini b/tox.ini
index 00c6e00e3b72c4de21dc725173e3bb60ea5fa55b..cd805fe8ac27481937a1000a5a37412ff4f0d923 100644 (file)
--- a/tox.ini
+++ b/tox.ini
@@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34
 deps =
    nose
    coverage
+# We need a valid $HOME for test_compat_expanduser
+passenv = HOME
 defaultargs = test --exclude test_download.py --exclude test_age_restriction.py
     --exclude test_subtitles.py --exclude test_write_annotations.py
     --exclude test_youtube_lists.py
index a68b24ab485d6ef8e064ee7533a4a15379eb87cf..00af78e0600f8d2136116e91bcda179f70dbf9a5 100755 (executable)
@@ -49,6 +49,7 @@ from .utils import (
     ExtractorError,
     format_bytes,
     formatSeconds,
+    HEADRequest,
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
@@ -64,7 +65,6 @@ from .utils import (
     sanitize_path,
     std_headers,
     subtitles_filename,
-    takewhile_inclusive,
     UnavailableVideoError,
     url_basename,
     version_tuple,
@@ -72,6 +72,7 @@ from .utils import (
     write_string,
     YoutubeDLHandler,
     prepend_extension,
+    replace_extension,
     args_to_str,
     age_restricted,
 )
@@ -118,7 +119,7 @@ class YoutubeDL(object):
 
     username:          Username for authentication purposes.
     password:          Password for authentication purposes.
-    videopassword:     Password for acces a video.
+    videopassword:     Password for accessing a video.
     usenetrc:          Use netrc for authentication instead.
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
@@ -135,10 +136,10 @@ class YoutubeDL(object):
                        (or video) as a single JSON line.
     simulate:          Do not download the video files.
     format:            Video format code. See options.py for more information.
-    format_limit:      Highest quality format to try.
     outtmpl:           Template for output names.
     restrictfilenames: Do not allow "&" and spaces in file names
     ignoreerrors:      Do not stop on download errors.
+    force_generic_extractor: Force downloader to use the generic extractor
     nooverwrites:      Prevent overwriting files.
     playliststart:     Playlist item to start at.
     playlistend:       Playlist item to end at.
@@ -261,7 +262,8 @@ class YoutubeDL(object):
     The following options are used by the post processors:
     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                        otherwise prefer avconv.
-    exec_cmd:          Arbitrary command to run after downloading
+    postprocessor_args: A list of additional command-line arguments for the
+                        postprocessor.
     """
 
     params = None
@@ -627,13 +629,16 @@ class YoutubeDL(object):
             info_dict.setdefault(key, value)
 
     def extract_info(self, url, download=True, ie_key=None, extra_info={},
-                     process=True):
+                     process=True, force_generic_extractor=False):
         '''
         Returns a list with a dictionary for each video we find.
         If 'download', also downloads the videos.
         extra_info is a dict containing the extra values to add to each result
         '''
 
+        if not ie_key and force_generic_extractor:
+            ie_key = 'Generic'
+
         if ie_key:
             ies = [self.get_info_extractor(ie_key)]
         else:
@@ -761,7 +766,9 @@ class YoutubeDL(object):
             if isinstance(ie_entries, list):
                 n_all_entries = len(ie_entries)
                 if playlistitems:
-                    entries = [ie_entries[i - 1] for i in playlistitems]
+                    entries = [
+                        ie_entries[i - 1] for i in playlistitems
+                        if -n_all_entries <= i - 1 < n_all_entries]
                 else:
                     entries = ie_entries[playliststart:playlistend]
                 n_entries = len(entries)
@@ -916,15 +923,17 @@ class YoutubeDL(object):
         if not available_formats:
             return None
 
-        if format_spec == 'best' or format_spec is None:
-            return available_formats[-1]
-        elif format_spec == 'worst':
+        if format_spec in ['best', 'worst', None]:
+            format_idx = 0 if format_spec == 'worst' else -1
             audiovideo_formats = [
                 f for f in available_formats
                 if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
             if audiovideo_formats:
-                return audiovideo_formats[0]
-            return available_formats[0]
+                return audiovideo_formats[format_idx]
+            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst available format
+            elif (all(f.get('acodec') != 'none' for f in available_formats) or
+                  all(f.get('vcodec') != 'none' for f in available_formats)):
+                return available_formats[format_idx]
         elif format_spec == 'bestaudio':
             audio_formats = [
                 f for f in available_formats
@@ -1001,7 +1010,7 @@ class YoutubeDL(object):
                 t.get('preference'), t.get('width'), t.get('height'),
                 t.get('id'), t.get('url')))
             for i, t in enumerate(thumbnails):
-                if 'width' in t and 'height' in t:
+                if t.get('width') and t.get('height'):
                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
                 if t.get('id') is None:
                     t['id'] = '%d' % i
@@ -1013,13 +1022,13 @@ class YoutubeDL(object):
             info_dict['display_id'] = info_dict['id']
 
         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
-            # Working around negative timestamps in Windows
-            # (see http://bugs.python.org/issue1646728)
-            if info_dict['timestamp'] < 0 and os.name == 'nt':
-                info_dict['timestamp'] = 0
-            upload_date = datetime.datetime.utcfromtimestamp(
-                info_dict['timestamp'])
-            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+            # see http://bugs.python.org/issue1646728)
+            try:
+                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            except (ValueError, OverflowError, OSError):
+                pass
 
         if self.params.get('listsubtitles', False):
             if 'automatic_captions' in info_dict:
@@ -1030,12 +1039,6 @@ class YoutubeDL(object):
             info_dict['id'], info_dict.get('subtitles'),
             info_dict.get('automatic_captions'))
 
-        # This extractors handle format selection themselves
-        if info_dict['extractor'] in ['Youku']:
-            if download:
-                self.process_info(info_dict)
-            return info_dict
-
         # We now pick which formats have to be downloaded
         if info_dict.get('formats') is None:
             # There's only one format available
@@ -1046,6 +1049,8 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')
 
+        formats_dict = {}
+
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
             if 'url' not in format:
@@ -1053,6 +1058,18 @@ class YoutubeDL(object):
 
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
             if format.get('format') is None:
                 format['format'] = '{id} - {res}{note}'.format(
                     id=format['format_id'],
@@ -1068,12 +1085,6 @@ class YoutubeDL(object):
             full_format_info.update(format)
             format['http_headers'] = self._calc_headers(full_format_info)
 
-        format_limit = self.params.get('format_limit', None)
-        if format_limit:
-            formats = list(takewhile_inclusive(
-                lambda f: f['format_id'] != format_limit, formats
-            ))
-
         # TODO Central sorting goes here
 
         if formats[0] is not info_dict:
@@ -1091,7 +1102,14 @@ class YoutubeDL(object):
 
         req_format = self.params.get('format')
         if req_format is None:
-            req_format = 'best'
+            req_format_list = []
+            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
+                    info_dict['extractor'] in ['youtube', 'ted']):
+                merger = FFmpegMergerPP(self)
+                if merger.available and merger.can_merge():
+                    req_format_list.append('bestvideo+bestaudio')
+            req_format_list.append('best')
+            req_format = '/'.join(req_format_list)
         formats_to_download = []
         if req_format == 'all':
             formats_to_download = formats
@@ -1273,7 +1291,7 @@ class YoutubeDL(object):
             return
 
         if self.params.get('writedescription', False):
-            descfn = filename + '.description'
+            descfn = replace_extension(filename, 'description', info_dict.get('ext'))
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                 self.to_screen('[info] Video description is already present')
             elif info_dict.get('description') is None:
@@ -1288,7 +1306,7 @@ class YoutubeDL(object):
                     return
 
         if self.params.get('writeannotations', False):
-            annofn = filename + '.annotations.xml'
+            annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                 self.to_screen('[info] Video annotations are already present')
             else:
@@ -1335,13 +1353,13 @@ class YoutubeDL(object):
                     return
 
         if self.params.get('writeinfojson', False):
-            infofn = os.path.splitext(filename)[0] + '.info.json'
+            infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                 self.to_screen('[info] Video description metadata is already present')
             else:
                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                 try:
-                    write_json_file(info_dict, infofn)
+                    write_json_file(self.filter_requested_info(info_dict), infofn)
                 except (OSError, IOError):
                     self.report_error('Cannot write metadata to JSON file ' + infofn)
                     return
@@ -1361,24 +1379,57 @@ class YoutubeDL(object):
                 if info_dict.get('requested_formats') is not None:
                     downloaded = []
                     success = True
-                    merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
+                    merger = FFmpegMergerPP(self)
                     if not merger.available:
                         postprocessors = []
                         self.report_warning('You have requested multiple '
                                             'formats but ffmpeg or avconv are not installed.'
-                                            ' The formats won\'t be merged')
+                                            ' The formats won\'t be merged.')
                     else:
                         postprocessors = [merger]
-                    for f in info_dict['requested_formats']:
-                        new_info = dict(info_dict)
-                        new_info.update(f)
-                        fname = self.prepare_filename(new_info)
-                        fname = prepend_extension(fname, 'f%s' % f['format_id'])
-                        downloaded.append(fname)
-                        partial_success = dl(fname, new_info)
-                        success = success and partial_success
-                    info_dict['__postprocessors'] = postprocessors
-                    info_dict['__files_to_merge'] = downloaded
+
+                    def compatible_formats(formats):
+                        """Return True if both formats' extensions belong to one compatible group."""
+                        video, audio = formats
+                        video_ext, audio_ext = video.get('ext'), audio.get('ext')
+                        if video_ext and audio_ext:
+                            COMPATIBLE_EXTS = (
+                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
+                                ('webm',),  # 1-tuple: a bare ('webm') is a str and would match substrings
+                            )
+                            for exts in COMPATIBLE_EXTS:
+                                if video_ext in exts and audio_ext in exts:
+                                    return True
+                        # TODO: Check acodec/vcodec
+                        return False
+
+                    filename_real_ext = os.path.splitext(filename)[1][1:]
+                    filename_wo_ext = (
+                        os.path.splitext(filename)[0]
+                        if filename_real_ext == info_dict['ext']
+                        else filename)
+                    requested_formats = info_dict['requested_formats']
+                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
+                        info_dict['ext'] = 'mkv'
+                        self.report_warning(
+                            'Requested formats are incompatible for merge and will be merged into mkv.')
+                    # Ensure filename always has a correct extension for successful merge
+                    filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
+                    if os.path.exists(encodeFilename(filename)):
+                        self.to_screen(
+                            '[download] %s has already been downloaded and '
+                            'merged' % filename)
+                    else:
+                        for f in requested_formats:
+                            new_info = dict(info_dict)
+                            new_info.update(f)
+                            fname = self.prepare_filename(new_info)
+                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+                            downloaded.append(fname)
+                            partial_success = dl(fname, new_info)
+                            success = success and partial_success
+                        info_dict['__postprocessors'] = postprocessors
+                        info_dict['__files_to_merge'] = downloaded
                 else:
                     # Just a single file
                     success = dl(filename, info_dict)
@@ -1448,7 +1499,8 @@ class YoutubeDL(object):
         for url in url_list:
             try:
                 # It also downloads the videos
-                res = self.extract_info(url)
+                res = self.extract_info(
+                    url, force_generic_extractor=self.params.get('force_generic_extractor', False))
             except UnavailableVideoError:
                 self.report_error('unable to download video')
             except MaxDownloadsReached:
@@ -1465,7 +1517,7 @@ class YoutubeDL(object):
                 [info_filename], mode='r',
                 openhook=fileinput.hook_encoded('utf-8'))) as f:
             # FileInput doesn't have a read method, we can't call json.load
-            info = json.loads('\n'.join(f))
+            info = self.filter_requested_info(json.loads('\n'.join(f)))
         try:
             self.process_ie_result(info, download=True)
         except DownloadError:
@@ -1477,6 +1529,12 @@ class YoutubeDL(object):
                 raise
         return self._download_retcode
 
+    @staticmethod
+    def filter_requested_info(info_dict):
+        # Return a new dict holding every entry of info_dict except the requested_* keys below
+        excluded_keys = ('requested_formats', 'requested_subtitles')
+        return dict(item for item in info_dict.items() if item[0] not in excluded_keys)
+
     def post_process(self, filename, ie_info):
         """Run all the postprocessors on the given file."""
         info = dict(ie_info)
@@ -1486,24 +1544,18 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
-            keep_video = None
-            old_filename = info['filepath']
+            files_to_delete = []
             try:
-                keep_video_wish, info = pp.run(info)
-                if keep_video_wish is not None:
-                    if keep_video_wish:
-                        keep_video = keep_video_wish
-                    elif keep_video is None:
-                        # No clear decision yet, let IE decide
-                        keep_video = keep_video_wish
+                files_to_delete, info = pp.run(info)
             except PostProcessingError as e:
                 self.report_error(e.msg)
-            if keep_video is False and not self.params.get('keepvideo', False):
-                try:
+            if files_to_delete and not self.params.get('keepvideo', False):
+                for old_filename in files_to_delete:
                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
-                    os.remove(encodeFilename(old_filename))
-                except (IOError, OSError):
-                    self.report_warning('Unable to remove downloaded video file')
+                    try:
+                        os.remove(encodeFilename(old_filename))
+                    except (IOError, OSError):
+                        self.report_warning('Unable to remove downloaded original file')
 
     def _make_archive_id(self, info_dict):
         # Future-proof against any change in case
@@ -1671,7 +1723,8 @@ class YoutubeDL(object):
             if req_is_string:
                 req = url_escaped
             else:
-                req = compat_urllib_request.Request(
+                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+                req = req_type(
                     url_escaped, data=req.data, headers=req.headers,
                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 
@@ -1817,7 +1870,7 @@ class YoutubeDL(object):
             thumb_ext = determine_ext(t['url'], 'jpg')
             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
-            thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
 
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
index 1c8b411b7f037d4bce2face086f0743f4f23003a..55b22c889f97c73d28e732466f850bcfb1615c83 100644 (file)
@@ -169,7 +169,7 @@ def _real_main(argv=None):
         if not opts.audioquality.isdigit():
             parser.error('invalid audio quality specified')
     if opts.recodevideo is not None:
-        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
+        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']:
             parser.error('invalid video recode format specified')
     if opts.convertsubtitles is not None:
         if opts.convertsubtitles not in ['srt', 'vtt', 'ass']:
@@ -240,15 +240,18 @@ def _real_main(argv=None):
     if opts.xattrs:
         postprocessors.append({'key': 'XAttrMetadata'})
     if opts.embedthumbnail:
-        if not opts.addmetadata:
-            postprocessors.append({'key': 'FFmpegAudioFix'})
-        postprocessors.append({'key': 'AtomicParsley'})
+        already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails
+        postprocessors.append({
+            'key': 'EmbedThumbnail',
+            'already_have_thumbnail': already_have_thumbnail
+        })
+        if not already_have_thumbnail:
+            opts.writethumbnail = True
     # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
     # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
     if opts.exec_cmd:
         postprocessors.append({
             'key': 'ExecAfterDownload',
-            'verboseOutput': opts.verbose,
             'exec_cmd': opts.exec_cmd,
         })
     if opts.xattr_set_filesize:
@@ -260,6 +263,9 @@ def _real_main(argv=None):
     external_downloader_args = None
     if opts.external_downloader_args:
         external_downloader_args = shlex.split(opts.external_downloader_args)
+    postprocessor_args = None
+    if opts.postprocessor_args:
+        postprocessor_args = shlex.split(opts.postprocessor_args)
     match_filter = (
         None if opts.match_filter is None
         else match_filter_func(opts.match_filter))
@@ -285,12 +291,12 @@ def _real_main(argv=None):
         'simulate': opts.simulate or any_getting,
         'skip_download': opts.skip_download,
         'format': opts.format,
-        'format_limit': opts.format_limit,
         'listformats': opts.listformats,
         'outtmpl': outtmpl,
         'autonumber_size': opts.autonumber_size,
         'restrictfilenames': opts.restrictfilenames,
         'ignoreerrors': opts.ignoreerrors,
+        'force_generic_extractor': opts.force_generic_extractor,
         'ratelimit': opts.ratelimit,
         'nooverwrites': opts.nooverwrites,
         'retries': opts_retries,
@@ -348,7 +354,6 @@ def _real_main(argv=None):
         'default_search': opts.default_search,
         'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
         'encoding': opts.encoding,
-        'exec_cmd': opts.exec_cmd,
         'extract_flat': opts.extract_flat,
         'merge_output_format': opts.merge_output_format,
         'postprocessors': postprocessors,
@@ -365,6 +370,7 @@ def _real_main(argv=None):
         'ffmpeg_location': opts.ffmpeg_location,
         'hls_prefer_native': opts.hls_prefer_native,
         'external_downloader_args': external_downloader_args,
+        'postprocessor_args': postprocessor_args,
         'cn_verification_proxy': opts.cn_verification_proxy,
     }
 
index 07224d5084158184ac30c927b1b79802d398c336..7817adcfdd546f70cfb76e0634b8df8ddbcaf8e0 100644 (file)
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
     """
     NONCE_LENGTH_BYTES = 8
 
-    data = bytes_to_intlist(base64.b64decode(data))
+    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
     password = bytes_to_intlist(password.encode('utf-8'))
 
     key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
index 973bcd32074107f70c1b781e95b97ef34501b88f..0c57c7aebf6bd45a8f9edfc7e1d585f1bbf2f23a 100644 (file)
@@ -9,6 +9,7 @@ import shutil
 import socket
 import subprocess
 import sys
+import itertools
 
 
 try:
@@ -46,11 +47,6 @@ try:
 except ImportError:  # Python 2
     import htmlentitydefs as compat_html_entities
 
-try:
-    import html.parser as compat_html_parser
-except ImportError:  # Python 2
-    import HTMLParser as compat_html_parser
-
 try:
     import http.client as compat_http_client
 except ImportError:  # Python 2
@@ -79,42 +75,74 @@ except ImportError:
     import BaseHTTPServer as compat_http_server
 
 try:
+    from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
     from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
-    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
-        if string == '':
+    from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+except ImportError:  # Python 2
+    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
+                else re.compile('([\x00-\x7f]+)'))
+
+    # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
+    # implementations from cpython 3.4.3's stdlib. Python 2's version
+    # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244)
+
+    def compat_urllib_parse_unquote_to_bytes(string):
+        """unquote_to_bytes('abc%20def') -> b'abc def'."""
+        # Note: strings are encoded as UTF-8. This is only an issue if it contains
+        # unescaped non-ASCII characters, which URIs should not.
+        if not string:
+            # Is it a string-like object?
+            string.split
+            return b''
+        if isinstance(string, unicode):
+            string = string.encode('utf-8')
+        bits = string.split(b'%')
+        if len(bits) == 1:
             return string
-        res = string.split('%')
-        if len(res) == 1:
+        res = [bits[0]]
+        append = res.append
+        for item in bits[1:]:
+            try:
+                append(compat_urllib_parse._hextochr[item[:2]])
+                append(item[2:])
+            except KeyError:
+                append(b'%')
+                append(item)
+        return b''.join(res)
+
+    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
+        """Replace %xx escapes by their single-character equivalent. The optional
+        encoding and errors parameters specify how to decode percent-encoded
+        sequences into Unicode characters, as accepted by the bytes.decode()
+        method.
+        By default, percent-encoded sequences are decoded with UTF-8, and invalid
+        sequences are replaced by a placeholder character.
+
+        unquote('abc%20def') -> 'abc def'.
+        """
+        if '%' not in string:
+            string.split
             return string
         if encoding is None:
             encoding = 'utf-8'
         if errors is None:
             errors = 'replace'
-        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-        pct_sequence = b''
-        string = res[0]
-        for item in res[1:]:
-            try:
-                if not item:
-                    raise ValueError
-                pct_sequence += item[:2].decode('hex')
-                rest = item[2:]
-                if not rest:
-                    # This segment was just a single percent-encoded character.
-                    # May be part of a sequence of code units, so delay decoding.
-                    # (Stored in pct_sequence).
-                    continue
-            except ValueError:
-                rest = '%' + item
-            # Encountered non-percent-encoded characters. Flush the current
-            # pct_sequence.
-            string += pct_sequence.decode(encoding, errors) + rest
-            pct_sequence = b''
-        if pct_sequence:
-            # Flush the final pct_sequence
-            string += pct_sequence.decode(encoding, errors)
-        return string
+        bits = _asciire.split(string)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'):
+        """Decode an HTML form value: every plus sign becomes a space, then the
+        result is passed through compat_urllib_parse_unquote().
+
+        unquote_plus('%7e/abc+def') -> '~/abc def'
+        """
+        spaced = string.replace('+', ' ')
+        return compat_urllib_parse_unquote(spaced, encoding, errors)
 
 try:
     compat_str = unicode  # Python 2
@@ -393,6 +421,15 @@ else:
             pass
         return _terminal_size(columns, lines)
 
+try:
+    compat_itertools_count = itertools.count
+    compat_itertools_count(start=0, step=1)  # probe: TypeError if the step argument is unsupported
+except TypeError:  # Python 2.6
+    def compat_itertools_count(start=0, step=1):
+        current = start
+        while True:
+            yield current
+            current += step
 
 __all__ = [
     'compat_HTTPError',
@@ -404,9 +441,9 @@ __all__ = [
     'compat_getenv',
     'compat_getpass',
     'compat_html_entities',
-    'compat_html_parser',
     'compat_http_client',
     'compat_http_server',
+    'compat_itertools_count',
     'compat_kwargs',
     'compat_ord',
     'compat_parse_qs',
@@ -417,6 +454,8 @@ __all__ = [
     'compat_urllib_error',
     'compat_urllib_parse',
     'compat_urllib_parse_unquote',
+    'compat_urllib_parse_unquote_plus',
+    'compat_urllib_parse_unquote_to_bytes',
     'compat_urllib_parse_urlparse',
     'compat_urllib_request',
     'compat_urlparse',
index 9fb66e2f7f680a71c05fdd866c72b0db2dd91a77..dccc59212d3028bb9a96f0eb9ffff4acb0be681e 100644 (file)
@@ -6,8 +6,9 @@ from .f4m import F4mFD
 from .hls import HlsFD
 from .hls import NativeHlsFD
 from .http import HttpFD
-from .mplayer import MplayerFD
+from .rtsp import RtspFD
 from .rtmp import RtmpFD
+from .dash import DashSegmentsFD
 
 from ..utils import (
     determine_protocol,
@@ -17,9 +18,10 @@ PROTOCOL_MAP = {
     'rtmp': RtmpFD,
     'm3u8_native': NativeHlsFD,
     'm3u8': HlsFD,
-    'mms': MplayerFD,
-    'rtsp': MplayerFD,
+    'mms': RtspFD,
+    'rtsp': RtspFD,
     'f4m': F4mFD,
+    'http_dash_segments': DashSegmentsFD,
 }
 
 
index a0fc5ead06a4e8adba7886d2a4087d28ca9cdbcb..97e755d4baa56972a9a4e5223a6871edd8bf0565 100644 (file)
@@ -8,6 +8,7 @@ import time
 from ..compat import compat_str
 from ..utils import (
     encodeFilename,
+    decodeArgument,
     format_bytes,
     timeconvert,
 )
@@ -353,19 +354,15 @@ class FileDownloader(object):
         # this interface
         self._progress_hooks.append(ph)
 
-    def _debug_cmd(self, args, subprocess_encoding, exe=None):
+    def _debug_cmd(self, args, exe=None):
         if not self.params.get('verbose', False):
             return
 
+        str_args = [decodeArgument(a) for a in args]
+
         if exe is None:
-            exe = os.path.basename(args[0])
+            exe = os.path.basename(str_args[0])
 
-        if subprocess_encoding:
-            str_args = [
-                a.decode(subprocess_encoding) if isinstance(a, bytes) else a
-                for a in args]
-        else:
-            str_args = args
         try:
             import pipes
             shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
new file mode 100644 (file)
index 0000000..a4685d3
--- /dev/null
@@ -0,0 +1,89 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import FileDownloader
+from ..compat import compat_urllib_request
+
+
+class DashSegmentsFD(FileDownloader):
+    """
+    Download segments in a DASH manifest
+
+    The initialization segment and then every media segment are fetched over
+    HTTP and appended, in order, to a single output file.
+    """
+    def real_download(self, filename, info_dict):
+        """Download all segments described by info_dict into filename.
+
+        Reads 'url' (prefix for relative segment URLs), 'initialization_url',
+        'segment_urls' and 'id' from info_dict.  With the 'test' param set,
+        downloading stops once _TEST_FILE_SIZE bytes of media segments are
+        written.  Returns True after renaming the temporary file to filename
+        and reporting a 'finished' progress event.
+        """
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+        base_url = info_dict['url']
+        segment_urls = info_dict['segment_urls']
+
+        is_test = self.params.get('test', False)
+        remaining_bytes = self._TEST_FILE_SIZE if is_test else None
+        byte_counter = 0
+
+        def append_url_to_file(outf, target_url, target_name, remaining_bytes=None):
+            """Fetch target_url, append its body to outf and return the number
+            of bytes written (capped at remaining_bytes when that is given)."""
+            self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name))
+            req = compat_urllib_request.Request(target_url)
+            if remaining_bytes is not None:
+                req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1))
+
+            response = self.ydl.urlopen(req)
+            try:
+                data = response.read()
+            finally:
+                # Close explicitly rather than leaving the connection to the garbage collector
+                response.close()
+
+            # Truncate locally too, in case more than the requested range came back
+            if remaining_bytes is not None:
+                data = data[:remaining_bytes]
+
+            outf.write(data)
+            return len(data)
+
+        def combine_url(base_url, target_url):
+            """Return target_url as is when it is an absolute http(s) URL,
+            otherwise append it to base_url after a slash."""
+            if re.match(r'^https?://', target_url):
+                return target_url
+            return '%s/%s' % (base_url, target_url)
+
+        with open(tmpfilename, 'wb') as outf:
+            # The initialization segment is written in full and is not charged
+            # against the test-mode limit, but it does count as downloaded bytes
+            byte_counter += append_url_to_file(
+                outf, combine_url(base_url, info_dict['initialization_url']),
+                'initialization segment')
+            for i, segment_url in enumerate(segment_urls):
+                segment_len = append_url_to_file(
+                    outf, combine_url(base_url, segment_url),
+                    'segment %d / %d' % (i + 1, len(segment_urls)),
+                    remaining_bytes)
+                byte_counter += segment_len
+                if remaining_bytes is not None:
+                    remaining_bytes -= segment_len
+                    if remaining_bytes <= 0:
+                        break
+
+        self.try_rename(tmpfilename, filename)
+
+        self._hook_progress({
+            'downloaded_bytes': byte_counter,
+            'total_bytes': byte_counter,
+            'filename': filename,
+            'status': 'finished',
+        })
+
+        return True
index 1673b2382af545e1ec8f4ef81bc8656555e388fb..1d5cc99043d02f658064e688c268c37171c37325 100644 (file)
@@ -2,11 +2,11 @@ from __future__ import unicode_literals
 
 import os.path
 import subprocess
-import sys
 
 from .common import FileDownloader
 from ..utils import (
     encodeFilename,
+    encodeArgument,
 )
 
 
@@ -60,17 +60,9 @@ class ExternalFD(FileDownloader):
 
     def _call_downloader(self, tmpfilename, info_dict):
         """ Either overwrite this or implement _make_cmd """
-        cmd = self._make_cmd(tmpfilename, info_dict)
-
-        if sys.platform == 'win32' and sys.version_info < (3, 0):
-            # Windows subprocess module does not actually support Unicode
-            # on Python 2.x
-            # See http://stackoverflow.com/a/9951851/35070
-            subprocess_encoding = sys.getfilesystemencoding()
-            cmd = [a.encode(subprocess_encoding, 'ignore') for a in cmd]
-        else:
-            subprocess_encoding = None
-        self._debug_cmd(cmd, subprocess_encoding)
+        cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
+
+        self._debug_cmd(cmd)
 
         p = subprocess.Popen(
             cmd, stderr=subprocess.PIPE)
@@ -117,6 +109,14 @@ class Aria2cFD(ExternalFD):
         cmd += ['--', info_dict['url']]
         return cmd
 
+
+class HttpieFD(ExternalFD):
+    def _make_cmd(self, tmpfilename, info_dict):
+        # Command line: http --download --output FILE URL, then one Header:Value item per HTTP header
+        base_cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
+        header_items = ['%s:%s' % pair for pair in info_dict['http_headers'].items()]
+        return base_cmd + header_items
+
 _BY_NAME = dict(
     (klass.get_basename(), klass)
     for name, klass in globals().items()
@@ -131,5 +131,6 @@ def list_external_downloaders():
 def get_external_downloader(external_downloader):
     """ Given the name of the executable, see whether we support the given
         downloader . """
-    bn = os.path.basename(external_downloader)
+    # Drop .exe extension on Windows
+    bn = os.path.splitext(os.path.basename(external_downloader))[0]
     return _BY_NAME[bn]
index 4ab000d6732cca5f71e8b8ab1d99dd5bfb49d57e..b1a858c452617ed452bc0dcae8d612d22fd224d3 100644 (file)
@@ -389,6 +389,8 @@ class F4mFD(FileDownloader):
             url = base_url + name
             if akamai_pv:
                 url += '?' + akamai_pv.strip(';')
+            if info_dict.get('extra_param_to_segment_url'):
+                url += info_dict.get('extra_param_to_segment_url')
             frag_filename = '%s-%s' % (tmpfilename, name)
             try:
                 success = http_dl.download(frag_filename, {'url': url})
index d136bebd1fe45761312bd90c31a95ddaf1754271..b7f144af9ea33a102246632e04e71707be3d98ad 100644 (file)
@@ -28,13 +28,8 @@ class HttpFD(FileDownloader):
         add_headers = info_dict.get('http_headers')
         if add_headers:
             headers.update(add_headers)
-        data = info_dict.get('http_post_data')
-        http_method = info_dict.get('http_method')
-        basic_request = compat_urllib_request.Request(url, data, headers)
-        request = compat_urllib_request.Request(url, data, headers)
-        if http_method is not None:
-            basic_request.get_method = lambda: http_method
-            request.get_method = lambda: http_method
+        basic_request = compat_urllib_request.Request(url, None, headers)
+        request = compat_urllib_request.Request(url, None, headers)
 
         is_test = self.params.get('test', False)
 
diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py
deleted file mode 100644 (file)
index 72cef30..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import unicode_literals
-
-import os
-import subprocess
-
-from .common import FileDownloader
-from ..utils import (
-    check_executable,
-    encodeFilename,
-)
-
-
-class MplayerFD(FileDownloader):
-    def real_download(self, filename, info_dict):
-        url = info_dict['url']
-        self.report_destination(filename)
-        tmpfilename = self.temp_name(filename)
-
-        args = [
-            'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',
-            '-dumpstream', '-dumpfile', tmpfilename, url]
-        # Check for mplayer first
-        if not check_executable('mplayer', ['-h']):
-            self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0])
-            return False
-
-        # Download using mplayer.
-        retval = subprocess.call(args)
-        if retval == 0:
-            fsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen('\r[%s] %s bytes' % (args[0], fsize))
-            self.try_rename(tmpfilename, filename)
-            self._hook_progress({
-                'downloaded_bytes': fsize,
-                'total_bytes': fsize,
-                'filename': filename,
-                'status': 'finished',
-            })
-            return True
-        else:
-            self.to_stderr('\n')
-            self.report_error('mplayer exited with code %d' % retval)
-            return False
index ddf5724ae6f05259194c67473bfd212ddeb896ff..7d19bb808a820da77aeb21070ebbdec4355f6739 100644 (file)
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 import os
 import re
 import subprocess
-import sys
 import time
 
 from .common import FileDownloader
@@ -11,6 +10,7 @@ from ..compat import compat_str
 from ..utils import (
     check_executable,
     encodeFilename,
+    encodeArgument,
     get_exe_version,
 )
 
@@ -121,7 +121,7 @@ class RtmpFD(FileDownloader):
         # possible. This is part of rtmpdump's normal usage, AFAIK.
         basic_args = [
             'rtmpdump', '--verbose', '-r', url,
-            '-o', encodeFilename(tmpfilename, True)]
+            '-o', tmpfilename]
         if player_url is not None:
             basic_args += ['--swfVfy', player_url]
         if page_url is not None:
@@ -131,7 +131,7 @@ class RtmpFD(FileDownloader):
         if play_path is not None:
             basic_args += ['--playpath', play_path]
         if tc_url is not None:
-            basic_args += ['--tcUrl', url]
+            basic_args += ['--tcUrl', tc_url]
         if test:
             basic_args += ['--stop', '1']
         if flash_version is not None:
@@ -154,16 +154,9 @@ class RtmpFD(FileDownloader):
         if not live and continue_dl:
             args += ['--skip', '1']
 
-        if sys.platform == 'win32' and sys.version_info < (3, 0):
-            # Windows subprocess module does not actually support Unicode
-            # on Python 2.x
-            # See http://stackoverflow.com/a/9951851/35070
-            subprocess_encoding = sys.getfilesystemencoding()
-            args = [a.encode(subprocess_encoding, 'ignore') for a in args]
-        else:
-            subprocess_encoding = None
+        args = [encodeArgument(a) for a in args]
 
-        self._debug_cmd(args, subprocess_encoding, exe='rtmpdump')
+        self._debug_cmd(args, exe='rtmpdump')
 
         RD_SUCCESS = 0
         RD_FAILED = 1
@@ -180,7 +173,11 @@ class RtmpFD(FileDownloader):
             prevsize = os.path.getsize(encodeFilename(tmpfilename))
             self.to_screen('[rtmpdump] %s bytes' % prevsize)
             time.sleep(5.0)  # This seems to be needed
-            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
+            args = basic_args + ['--resume']
+            if retval == RD_FAILED:
+                args += ['--skip', '1']
+            args = [encodeArgument(a) for a in args]
+            retval = run_rtmpdump(args)
             cursize = os.path.getsize(encodeFilename(tmpfilename))
             if prevsize == cursize and retval == RD_FAILED:
                 break
diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py
new file mode 100644 (file)
index 0000000..3eb2952
--- /dev/null
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+import os
+import subprocess
+
+from .common import FileDownloader
+from ..utils import (
+    check_executable,
+    encodeFilename,
+)
+
+
+class RtspFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        url = info_dict['url']
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+
+        if check_executable('mplayer', ['-h']):
+            args = [
+                'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',
+                '-dumpstream', '-dumpfile', tmpfilename, url]
+        elif check_executable('mpv', ['-h']):
+            args = [
+                'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url]
+        else:
+            self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.')
+            return False
+
+        retval = subprocess.call(args)
+        if retval == 0:
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen('\r[%s] %s bytes' % (args[0], fsize))
+            self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+            })
+            return True
+        else:
+            self.to_stderr('\n')
+            self.report_error('%s exited with code %d' % (args[0], retval))
+            return False
index 3d6e981b22d9148c783af2b31b8d9d3fde0ae88f..3cfa804ecf42db5603a30162310e681c38329aac 100644 (file)
@@ -4,7 +4,10 @@ from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+    AdobeTVIE,
+    AdobeTVVideoIE,
+)
 from .adultswim import AdultSwimIE
 from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
@@ -16,9 +19,14 @@ from .anysex import AnySexIE
 from .aol import AolIE
 from .allocine import AllocineIE
 from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
 from .appletrailers import AppleTrailersIE
 from .archiveorg import ArchiveOrgIE
-from .ard import ARDIE, ARDMediathekIE
+from .ard import (
+    ARDIE,
+    ARDMediathekIE,
+    SportschauIE,
+)
 from .arte import (
     ArteTvIE,
     ArteTVPlus7IE,
@@ -32,6 +40,7 @@ from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
 from .audiomack import AudiomackIE, AudiomackAlbumIE
 from .azubu import AzubuIE
+from .baidu import BaiduVideoIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
@@ -70,6 +79,7 @@ from .chirbit import (
     ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
+from .cinemassacre import CinemassacreIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
@@ -101,6 +111,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
     DailymotionUserIE,
+    DailymotionCloudIE,
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
@@ -110,6 +121,10 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
+from .dramafever import (
+    DramaFeverIE,
+    DramaFeverSeriesIE,
+)
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
@@ -134,11 +149,11 @@ from .ellentv import (
 )
 from .elpais import ElPaisIE
 from .embedly import EmbedlyIE
-from .empflix import EMPFlixIE
 from .engadget import EngadgetIE
 from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE
+from .espn import ESPNIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
@@ -146,10 +161,10 @@ from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
-from .firedrive import FiredriveIE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
 from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
 from .fktv import (
     FKTVIE,
     FKTVPosteckeIE,
@@ -160,6 +175,7 @@ from .footyroom import FootyRoomIE
 from .fourtube import FourTubeIE
 from .foxgay import FoxgayIE
 from .foxnews import FoxNewsIE
+from .foxsports import FoxSportsIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
@@ -185,6 +201,7 @@ from .gametrailers import GametrailersIE
 from .gazeta import GazetaIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
+from .gfycat import GfycatIE
 from .giantbomb import GiantBombIE
 from .giga import GigaIE
 from .glide import GlideIE
@@ -196,7 +213,6 @@ from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
 from .gorillavid import GorillaVidIE
 from .goshgay import GoshgayIE
-from .grooveshark import GroovesharkIE
 from .groupon import GrouponIE
 from .hark import HarkIE
 from .hearthisat import HearThisAtIE
@@ -226,6 +242,7 @@ from .infoq import InfoQIE
 from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
 from .ivi import (
     IviIE,
     IviCompilationIE
@@ -240,6 +257,7 @@ from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
@@ -247,6 +265,14 @@ from .keek import KeekIE
 from .kontrtube import KontrTubeIE
 from .krasview import KrasViewIE
 from .ku6 import Ku6IE
+from .kuwo import (
+    KuwoIE,
+    KuwoAlbumIE,
+    KuwoChartIE,
+    KuwoSingerIE,
+    KuwoCategoryIE,
+    KuwoMvIE,
+)
 from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
 from .lecture2go import Lecture2GoIE
@@ -256,7 +282,10 @@ from .letv import (
     LetvPlaylistIE
 )
 from .libsyn import LibsynIE
-from .lifenews import LifeNewsIE
+from .lifenews import (
+    LifeNewsIE,
+    LifeEmbedIE,
+)
 from .liveleak import LiveLeakIE
 from .livestream import (
     LivestreamIE,
@@ -274,6 +303,7 @@ from .macgamestore import MacGameStoreIE
 from .mailru import MailRuIE
 from .malemotion import MalemotionIE
 from .mdr import MDRIE
+from .megavideoz import MegaVideozIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
@@ -307,6 +337,7 @@ from .musicvault import MusicVaultIE
 from .muzu import MuzuTVIE
 from .myspace import MySpaceIE, MySpaceAlbumIE
 from .myspass import MySpassIE
+from .myvi import MyviIE
 from .myvideo import MyVideoIE
 from .myvidster import MyVidsterIE
 from .nationalgeographic import NationalGeographicIE
@@ -318,18 +349,29 @@ from .nbc import (
     NBCSportsIE,
     NBCSportsVPlayerIE,
 )
-from .ndr import NDRIE
+from .ndr import (
+    NDRIE,
+    NJoyIE,
+)
 from .ndtv import NDTVIE
 from .netzkino import NetzkinoIE
 from .nerdcubed import NerdCubedFeedIE
 from .nerdist import NerdistIE
+from .neteasemusic import (
+    NetEaseMusicIE,
+    NetEaseMusicAlbumIE,
+    NetEaseMusicSingerIE,
+    NetEaseMusicListIE,
+    NetEaseMusicMvIE,
+    NetEaseMusicProgramIE,
+    NetEaseMusicDjRadioIE,
+)
 from .newgrounds import NewgroundsIE
 from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
     NextMediaActionNewsIE,
-    AppleDailyRealtimeNewsIE,
-    AppleDailyAnimationNewsIE
+    AppleDailyIE,
 )
 from .nfb import NFBIE
 from .nfl import NFLIE
@@ -343,15 +385,18 @@ from .ninegag import NineGagIE
 from .noco import NocoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
+from .nova import NovaIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
     NPOLiveIE,
     NPORadioIE,
     NPORadioFragmentIE,
-    TegenlichtVproIE,
+    VPROIE,
+    WNLIE
 )
 from .nrk import (
     NRKIE,
@@ -360,11 +405,18 @@ from .nrk import (
 )
 from .ntvde import NTVDeIE
 from .ntvru import NTVRuIE
-from .nytimes import NYTimesIE
+from .nytimes import (
+    NYTimesIE,
+    NYTimesArticleIE,
+)
 from .nuvid import NuvidIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
-from .ooyala import OoyalaIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+    OoyalaIE,
+    OoyalaExternalIE,
+)
 from .openfilm import OpenFilmIE
 from .orf import (
     ORFTVthekIE,
@@ -375,8 +427,10 @@ from .orf import (
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
+from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
@@ -384,6 +438,7 @@ from .playfm import PlayFMIE
 from .playvid import PlayvidIE
 from .playwire import PlaywireIE
 from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
 from .pornhd import PornHdIE
 from .pornhub import (
     PornHubIE,
@@ -397,6 +452,13 @@ from .promptfile import PromptFileIE
 from .prosiebensat1 import ProSiebenSat1IE
 from .puls4 import Puls4IE
 from .pyvideo import PyvideoIE
+from .qqmusic import (
+    QQMusicIE,
+    QQMusicSingerIE,
+    QQMusicAlbumIE,
+    QQMusicToplistIE,
+    QQMusicPlaylistIE,
+)
 from .quickvid import QuickVidIE
 from .r7 import R7IE
 from .radiode import RadioDeIE
@@ -405,6 +467,7 @@ from .radiobremen import RadioBremenIE
 from .radiofrance import RadioFranceIE
 from .rai import RaiIE
 from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
 from .redtube import RedTubeIE
 from .restudy import RestudyIE
 from .reverbnation import ReverbNationIE
@@ -415,7 +478,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -429,6 +491,7 @@ from .rutube import (
     RutubePersonIE,
 )
 from .rutv import RUTVIE
+from .ruutu import RuutuIE
 from .sandia import SandiaIE
 from .safari import (
     SafariIE,
@@ -440,7 +503,8 @@ from .sbs import SBSIE
 from .scivee import SciVeeIE
 from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
-from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
+from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
+from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
@@ -455,9 +519,16 @@ from .smotri import (
     SmotriUserIE,
     SmotriBroadcastIE,
 )
+from .snagfilms import (
+    SnagFilmsIE,
+    SnagFilmsEmbedIE,
+)
 from .snotr import SnotrIE
-from .sockshare import SockshareIE
 from .sohu import SohuIE
+from .soompi import (
+    SoompiIE,
+    SoompiShowIE,
+)
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -470,7 +541,10 @@ from .soundgasm import (
 )
 from .southpark import (
     SouthParkIE,
-    SouthparkDeIE,
+    SouthParkDeIE,
+    SouthParkDkIE,
+    SouthParkEsIE,
+    SouthParkNlIE
 )
 from .space import SpaceIE
 from .spankbang import SpankBangIE
@@ -479,8 +553,12 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
 from .spiegeltv import SpiegeltvIE
 from .spike import SpikeIE
 from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
+    SportBoxIE,
+    SportBoxEmbedIE,
+)
 from .sportdeutschland import SportDeutschlandIE
+from .srf import SrfIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
@@ -489,7 +567,10 @@ from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
 from .sunporno import SunPornoIE
-from .svtplay import SVTPlayIE
+from .svt import (
+    SVTIE,
+    SVTPlayIE,
+)
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
@@ -515,11 +596,19 @@ from .tf1 import TF1IE
 from .theonion import TheOnionIE
 from .theplatform import ThePlatformIE
 from .thesixtyone import TheSixtyOneIE
+from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .tlc import TlcIE, TlcDeIE
-from .tmz import TMZIE
-from .tnaflix import TNAFlixIE
+from .tmz import (
+    TMZIE,
+    TMZArticleIE,
+)
+from .tnaflix import (
+    TNAFlixIE,
+    EMPFlixIE,
+    MovieFapIE,
+)
 from .thvideo import (
     THVideoIE,
     THVideoPlaylistIE
@@ -530,12 +619,21 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
 from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import TuneInIE
 from .turbo import TurboIE
 from .tutv import TutvIE
+from .tv2 import (
+    TV2IE,
+    TV2ArticleIE,
+)
 from .tv4 import TV4IE
+from .tvc import (
+    TVCIE,
+    TVCArticleIE,
+)
 from .tvigle import TvigleIE
 from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
@@ -554,6 +652,7 @@ from .twitch import (
     TwitchBookmarksIE,
     TwitchStreamIE,
 )
+from .twitter import TwitterCardIE
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
@@ -571,7 +670,11 @@ from .veoh import VeohIE
 from .vessel import VesselIE
 from .vesti import VestiIE
 from .vevo import VevoIE
-from .vgtv import VGTVIE
+from .vgtv import (
+    BTArticleIE,
+    BTVestlendingenIE,
+    VGTVIE,
+)
 from .vh1 import VH1IE
 from .vice import ViceIE
 from .viddler import ViddlerIE
@@ -602,12 +705,16 @@ from .vine import (
     VineIE,
     VineUserIE,
 )
-from .viki import VikiIE
+from .viki import (
+    VikiIE,
+    VikiChannelIE,
+)
 from .vk import (
     VKIE,
     VKUserVideosIE,
 )
 from .vodlocker import VodlockerIE
+from .voicerepublic import VoiceRepublicIE
 from .vporn import VpornIE
 from .vrt import VRTIE
 from .vube import VubeIE
@@ -622,7 +729,10 @@ from .wdr import (
     WDRMobileIE,
     WDRMausIE,
 )
-from .webofstories import WebOfStoriesIE
+from .webofstories import (
+    WebOfStoriesIE,
+    WebOfStoriesPlaylistIE,
+)
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
@@ -631,12 +741,16 @@ from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
-from .xvideos import XVideosIE
+from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
 from .xuite import XuiteIE
+from .xvideos import XVideosIE
 from .xxxymovies import XXXYMoviesIE
 from .yahoo import (
     YahooIE,
@@ -649,6 +763,7 @@ from .yandexmusic import (
     YandexMusicPlaylistIE,
 )
 from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
 from .ynet import YnetIE
 from .youjizz import YouJizzIE
 from .youku import YoukuIE
index 97d12856092975a094ec18a5fd7ecafef39c255a..5e43adc51f98c2f22e728c49150b84ae64f704e3 100644 (file)
@@ -5,6 +5,8 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     str_to_int,
+    float_or_none,
+    ISO639Utils,
 )
 
 
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class AdobeTVVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+            video_id)
+
+        formats = [{
+            'url': source['src'],
+            'width': source.get('width'),
+            'height': source.get('height'),
+            'tbr': source.get('bitrate'),
+        } for source in player_params['sources']]
+
+        # For both metadata and downloaded files the duration varies among
+        # formats. I just pick the max one
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in player_params['sources']]))
+
+        subtitles = {}
+        for translation in player_params.get('translations', []):
+            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            if lang_id not in subtitles:
+                subtitles[lang_id] = []
+            subtitles[lang_id].append({
+                'url': translation['vttPath'],
+                'ext': 'vtt',
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': player_params['title'],
+            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
index e15c015fbafd466aaba089bef979843a397f1ab7..0c00acfb5766ad6e899877b8ed173225c10fc4ec 100644 (file)
@@ -1,21 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    xpath_with_ns,
-    xpath_text,
-    find_xpath_attr,
-)
 
 
 class AftenpostenIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'
-
     _TEST = {
         'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
         'md5': 'fd828cd29774a729bf4d4425fe192972',
@@ -30,69 +20,4 @@ class AftenpostenIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        data = self._download_xml(
-            'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
-
-        NS_MAP = {
-            'atom': 'http://www.w3.org/2005/Atom',
-            'xt': 'http://xstream.dk/',
-            'media': 'http://search.yahoo.com/mrss/',
-        }
-
-        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
-
-        title = xpath_text(
-            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
-        description = xpath_text(
-            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
-        timestamp = parse_iso8601(xpath_text(
-            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
-
-        formats = []
-        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
-        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
-            media_url = media_content.get('url')
-            if not media_url:
-                continue
-            tbr = int_or_none(media_content.get('bitrate'))
-            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
-            if mobj:
-                formats.append({
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:%s' % mobj.group('playpath'),
-                    'app': mobj.group('app'),
-                    'ext': 'flv',
-                    'tbr': tbr,
-                    'format_id': 'rtmp-%d' % tbr,
-                })
-            else:
-                formats.append({
-                    'url': media_url,
-                    'tbr': tbr,
-                })
-        self._sort_formats(formats)
-
-        link = find_xpath_attr(
-            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
-        if link is not None:
-            formats.append({
-                'url': link.get('href'),
-                'format_id': link.get('rel'),
-            })
-
-        thumbnails = [{
-            'url': splash.get('url'),
-            'width': int_or_none(splash.get('width')),
-            'height': int_or_none(splash.get('height')),
-        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'timestamp': timestamp,
-            'formats': formats,
-            'thumbnails': thumbnails,
-        }
+        return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream')
index a117502bc0ad7bfec11592ec57da575898cacc3d..e0518cf261fbffc4dd23bc4a3800d04eae324139 100644 (file)
@@ -6,11 +6,11 @@ from ..utils import int_or_none
 
 
 class AftonbladetIE(InfoExtractor):
-    _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+    _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+        'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
         'info_dict': {
-            'id': 'article36015',
+            'id': '36015',
             'ext': 'mp4',
             'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
             'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
 
         # find internal video meta data
         meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
-        internal_meta_id = self._html_search_regex(
-            r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+        player_config = self._parse_json(self._html_search_regex(
+            r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+        internal_meta_id = player_config['videoId']
         internal_meta_url = meta_url % internal_meta_id
         internal_meta_json = self._download_json(
             internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
new file mode 100644 (file)
index 0000000..ea7a703
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    str_to_int,
+    ExtractorError
+)
+
+
+class AppleConnectIE(InfoExtractor):
+    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
+    _TEST = {
+        'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+        'md5': '10d0f2799111df4cb1c924520ca78f98',
+        'info_dict': {
+            'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+            'ext': 'm4v',
+            'title': 'Energy',
+            'uploader': 'Drake',
+            'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+            'upload_date': '20150710',
+            'timestamp': 1436545535,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        try:
+            video_json = self._html_search_regex(
+                r'class="auc-video-data">(\{.*?\})', webpage, 'json')
+        except ExtractorError:
+            raise ExtractorError('This post doesn\'t contain a video', expected=True)
+
+        video_data = self._parse_json(video_json, video_id)
+        timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
+        like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+
+        return {
+            'id': video_id,
+            'url': video_data['sslSrc'],
+            'title': video_data['title'],
+            'description': video_data['description'],
+            'uploader': video_data['artistName'],
+            'thumbnail': video_data['artworkUrl'],
+            'timestamp': timestamp,
+            'like_count': like_count,
+        }
index 9fc35a42b8612d828ccc3ae43c9e4f74782f5352..8feb7cb7456ec4db8d6a8f28b411a58cb5ac47a1 100644 (file)
@@ -33,7 +33,7 @@ class ArchiveOrgIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        json_url = url + ('?' if '?' in url else '&') + 'output=json'
+        json_url = url + ('&' if '?' in url else '?') + 'output=json'
         data = self._download_json(json_url, video_id)
 
         def get_optional(data_dict, field):
index 6a35ea463edcafe3b9d7db4c53b9bf0c53198fd0..6f465789b497a6625776c383ff699a64b0b5c346 100644 (file)
@@ -8,6 +8,7 @@ from .generic import GenericIE
 from ..utils import (
     determine_ext,
     ExtractorError,
+    get_element_by_attribute,
     qualities,
     int_or_none,
     parse_duration,
@@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor):
     _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
 
     _TESTS = [{
-        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
-        'only_matching': True,
+        'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
+        'info_dict': {
+            'id': '29582122',
+            'ext': 'mp4',
+            'title': 'Ich liebe das Leben trotzdem',
+            'description': 'md5:45e4c225c72b27993314b31a84a5261c',
+            'duration': 4557,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
-        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+        'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
+        'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
         'info_dict': {
-            'id': '22490580',
+            'id': '29522730',
             'ext': 'mp4',
-            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
-            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
+            'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)',
+            'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
+            'duration': 5252,
         },
-        'skip': 'Blocked outside of Germany',
+    }, {
+        # audio
+        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+        'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
+        'info_dict': {
+            'id': '28488308',
+            'ext': 'mp3',
+            'title': 'Tod eines Fußballers',
+            'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
+            'duration': 3240,
+        },
+    }, {
+        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+        'only_matching': True,
     }]
 
+    def _extract_media_info(self, media_info_url, webpage, video_id):
+        """Download the media JSON and turn it into a partial info dict.
+
+        media_info_url -- URL of the JSON document describing the media
+        webpage        -- HTML of the video page; only inspected for the
+                          '"fsk"' marker when no formats were found
+        video_id       -- id used for logging and as the result's 'id'
+
+        Returns a dict with 'id', 'duration', 'thumbnail', 'formats' and
+        'subtitles'; callers add 'title' and 'description' themselves.
+
+        Raises ExtractorError (expected) when no formats were found and the
+        page carries the '"fsk"' marker or the JSON sets '_geoblocked'.
+        """
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+
+        formats = self._extract_formats(media_info, video_id)
+
+        if not formats:
+            # Explain the two known reasons for an empty format list before
+            # falling through to the generic handling below.
+            if '"fsk"' in webpage:
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):
+                raise ExtractorError('This video is not available due to geo restriction', expected=True)
+
+        # NOTE(review): presumably _sort_formats reports an empty list on its
+        # own when neither condition above applied -- confirm in common.py.
+        self._sort_formats(formats)
+
+        duration = int_or_none(media_info.get('_duration'))
+        thumbnail = media_info.get('_previewImage')
+
+        # At most one subtitle track is exposed; it is filed under 'de'.
+        subtitles = {}
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{
+                'ext': 'srt',
+                'url': subtitle_url,
+            }]
+
+        return {
+            'id': video_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):
+        """Build the (unsorted) list of format dicts from the media JSON.
+
+        Walks every entry of '_mediaArray' and every stream in its
+        '_mediaStreamArray'.  A stream's '_stream' value may be a single URL
+        or a list of URLs; both are handled.  Streams without '_stream' and
+        URLs that are neither f4m/m3u8 manifests, RTMP play paths nor http(s)
+        links are skipped.
+
+        Returns a list of format dicts; the list may be empty.
+        """
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
+        for num, media in enumerate(media_array):
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:
+                    continue
+                # Normalise a single URL to a one-element list
+                if not isinstance(stream_urls, list):
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    ext = determine_ext(stream_url)
+                    if ext == 'f4m':
+                        # HDS manifest: expanded into formats by the helper
+                        formats.extend(self._extract_f4m_formats(
+                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+                            video_id, preference=-1, f4m_id='hds'))
+                    elif ext == 'm3u8':
+                        # HLS playlist: expanded into formats by the helper
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
+                    else:
+                        if server and server.startswith('rtmp'):
+                            # RTMP: the server is the URL, the stream value
+                            # is the play path on that server
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        elif stream_url.startswith('http'):
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        else:
+                            continue
+                        # File names ending in _<width>x<height>.mp4 reveal
+                        # the video dimensions
+                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        # Only directly built formats are flagged audio-only;
+                        # manifest-derived formats above are left untouched
+                        if type_ == 'audio':
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats
+
     def _real_extract(self, url):
         # determine video id from url
         m = re.match(self._VALID_URL, url)
@@ -92,46 +199,22 @@ class ARDMediathekIE(InfoExtractor):
                     'format_id': fid,
                     'url': furl,
                 })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
         else:  # request JSON file
-            media_info = self._download_json(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
-            # The second element of the _mediaArray contains the standard http urls
-            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
-            if not streams:
-                if '"fsk"' in webpage:
-                    raise ExtractorError('This video is only available after 20:00')
-
-            formats = []
-            for s in streams:
-                if type(s['_stream']) == list:
-                    for index, url in enumerate(s['_stream'][::-1]):
-                        quality = s['_quality'] + index
-                        formats.append({
-                            'quality': quality,
-                            'url': url,
-                            'format_id': '%s-%s' % (determine_ext(url), quality)
-                        })
-                    continue
-
-                format = {
-                    'quality': s['_quality'],
-                    'url': s['_stream'],
-                }
-
-                format['format_id'] = '%s-%s' % (
-                    determine_ext(format['url']), format['quality'])
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
 
-                formats.append(format)
-
-        self._sort_formats(formats)
-
-        return {
+        info.update({
             'id': video_id,
             'title': title,
             'description': description,
-            'formats': formats,
             'thumbnail': thumbnail,
-        }
+        })
+
+        return info
 
 
 class ARDIE(InfoExtractor):
@@ -189,3 +272,41 @@ class ARDIE(InfoExtractor):
             'upload_date': upload_date,
             'thumbnail': thumbnail,
         }
+
+
+# Extractor for sportschau.de video pages.  It reuses
+# ARDMediathekIE._extract_media_info for formats, duration, thumbnail and
+# subtitles, and only takes title and description from the page itself.
+class SportschauIE(ARDMediathekIE):
+    IE_NAME = 'Sportschau'
+    # 'baseurl' is the page URL without the '.html' suffix; 'id' is the part
+    # of the last path component that follows the literal 'video' prefix
+    _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
+    _TESTS = [{
+        'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
+        'info_dict': {
+            'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
+            'ext': 'mp4',
+            'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for a sportschau.de video page."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        base_url = mobj.group('baseurl')
+
+        webpage = self._download_webpage(url, video_id)
+        # NOTE(review): get_element_by_attribute presumably yields None when
+        # no element has class "headline", leaving 'title' empty -- confirm.
+        title = get_element_by_attribute('class', 'headline', webpage)
+        description = self._html_search_meta('description', webpage, 'description')
+
+        # The media JSON URL is the page URL with '.html' replaced by
+        # '-mc_defaultQuality-h.json'
+        info = self._extract_media_info(
+            base_url + '-mc_defaultQuality-h.json', webpage, video_id)
+
+        info.update({
+            'title': title,
+            'description': description,
+        })
+
+        return info
index 8273bd6c9ae3cdff82052c8f63efc68be97561b3..76de244774369dd53c510961ecf6b6a7641c7027 100644 (file)
@@ -7,7 +7,6 @@ from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
     unified_strdate,
-    get_element_by_id,
     get_element_by_attribute,
     int_or_none,
     qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
     def _real_extract(self, url):
         anchor_id, lang = self._extract_url_info(url)
         webpage = self._download_webpage(url, anchor_id)
-        row = get_element_by_id(anchor_id, webpage)
+        row = self._search_regex(
+            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+            webpage, 'row')
         return self._extract_from_webpage(row, anchor_id, lang)
 
 
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
new file mode 100644 (file)
index 0000000..e37ee44
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+
+
+class BaiduVideoIE(InfoExtractor):
+    # Playlist extractor for v.baidu.com series pages: every episode is
+    # resolved to the URL of the site actually hosting it and handed on as a
+    # url_result entry.
+    IE_DESC = '百度视频'
+    _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
+    _TESTS = [{
+        'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
+        'info_dict': {
+            'id': '1069',
+            'title': '中华小当家 TV版 (全52集)',
+            'description': 'md5:395a419e41215e531c857bb037bbaf80',
+        },
+        'playlist_count': 52,
+    }, {
+        'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand',
+        'info_dict': {
+            'id': '11595',
+            'title': 're:^奔跑吧兄弟',
+            'description': 'md5:1bf88bad6d850930f542d51547c089b8',
+        },
+        'playlist_mincount': 3,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist result with one entry per episode of the series."""
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        # The API path uses the URL category verbatim, while its 'dtype'
+        # parameter spells the 'show' category as 'tvshow'.
+        category = category2 = mobj.group('type')
+        if category == 'show':
+            category2 = 'tvshow'
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        playlist_title = self._html_search_regex(
+            r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage,
+            'playlist title', group='title')
+        # The description is optional.  The field name must come right after
+        # the page; previously playlist_id sat in that position, which turned
+        # the string 'playlist description' into the default value returned
+        # whenever the intro element was missing.
+        playlist_description = self._html_search_regex(
+            r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage,
+            'playlist description', default=None)
+
+        site = self._html_search_regex(
+            r'filterSite\s*:\s*["\']([^"]*)["\']', webpage,
+            'primary provider site')
+        api_result = self._download_json(
+            'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % (
+                category, category2, playlist_id, site),
+            playlist_id, 'Get playlist links')
+
+        entries = []
+        for episode in api_result[0]['episodes']:
+            episode_id = '%s_%s' % (playlist_id, episode['episode'])
+
+            # Episode links point at a Baidu page that redirects through
+            # JavaScript, so the real target is read from location.replace()
+            redirect_page = self._download_webpage(
+                compat_urlparse.urljoin(url, episode['url']), episode_id,
+                note='Download Baidu redirect page')
+            real_url = self._html_search_regex(
+                r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL')
+
+            entries.append(self.url_result(
+                real_url, video_title=episode['single_title']))
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)
index c193e66cad7275cffb6ee96e051d567b9262e773..8dff1d6e377c0c246cfc958821b1d18cae4b2b64 100644 (file)
@@ -1,12 +1,18 @@
 from __future__ import unicode_literals
 
 import re
-import json
 import itertools
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_urllib_parse,
     compat_urllib_request,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
 )
 
 
@@ -14,6 +20,8 @@ class BambuserIE(InfoExtractor):
     IE_NAME = 'bambuser'
     _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
     _API_KEY = '005f64509e19a868399060af746a00aa'
+    _LOGIN_URL = 'https://bambuser.com/user'
+    _NETRC_MACHINE = 'bambuser'
 
     _TEST = {
         'url': 'http://bambuser.com/v/4050584',
@@ -26,6 +34,9 @@ class BambuserIE(InfoExtractor):
             'duration': 3741,
             'uploader': 'pixelversity',
             'uploader_id': '344706',
+            'timestamp': 1382976692,
+            'upload_date': '20131028',
+            'view_count': int,
         },
         'params': {
             # It doesn't respect the 'Range' header, it would download the whole video
@@ -34,23 +45,60 @@ class BambuserIE(InfoExtractor):
         },
     }
 
+    def _login(self):
+        """Log in to bambuser.com when credentials are available.
+
+        Does nothing when _get_login_info() yields no username.  Raises
+        ExtractorError (expected) when the response page contains an error
+        message block.
+        """
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        # Fields posted to the login URL
+        login_form = {
+            'form_id': 'user_login',
+            'op': 'Log in',
+            'name': username,
+            'pass': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        request.add_header('Referer', self._LOGIN_URL)
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        # A failed login is detected through the error box in the returned
+        # page and reported with the site's own message.
+        login_error = self._html_search_regex(
+            r'(?s)<div class="messages error">(.+?)</div>',
+            response, 'login error', default=None)
+        if login_error:
+            raise ExtractorError(
+                'Unable to login: %s' % login_error, expected=True)
+
+    def _real_initialize(self):
+        """Initialization hook: attempt the optional login."""
+        self._login()
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
-                    '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
-        info_json = self._download_webpage(info_url, video_id)
-        info = json.loads(info_json)['result']
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s'
+            % (self._API_KEY, video_id), video_id)
+
+        error = info.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+        result = info['result']
 
         return {
             'id': video_id,
-            'title': info['title'],
-            'url': info['url'],
-            'thumbnail': info.get('preview'),
-            'duration': int(info['length']),
-            'view_count': int(info['views_total']),
-            'uploader': info['username'],
-            'uploader_id': info['owner']['uid'],
+            'title': result['title'],
+            'url': result['url'],
+            'thumbnail': result.get('preview'),
+            'duration': int_or_none(result.get('length')),
+            'uploader': result.get('username'),
+            'uploader_id': compat_str(result.get('owner', {}).get('uid')),
+            'timestamp': int_or_none(result.get('created')),
+            'fps': float_or_none(result.get('framerate')),
+            'view_count': int_or_none(result.get('views_total')),
+            'comment_count': int_or_none(result.get('comment_count')),
         }
 
 
index 86929496708fccf3bc0febe78cd1e599fda1ab97..505877b773d45b36be31d8dea8a6a1766d72d4ca 100644 (file)
@@ -72,7 +72,7 @@ class BandcampIE(InfoExtractor):
 
         download_link = m_download.group(1)
         video_id = self._search_regex(
-            r'(?ms)var TralbumData = {.*?id: (?P<id>\d+),?$',
+            r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
             webpage, 'video id')
 
         download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
index abc34a5761487b5a900294dac59db4a053b95cb0..5825d286774fa003d343f41c85e07e35340cb428 100644 (file)
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
 from ..compat import compat_HTTPError
 
 
@@ -112,6 +115,34 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             }
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
+            'info_dict': {
+                'id': 'p02n76xf',
+                'ext': 'flv',
+                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
+                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+            'info_dict': {
+                'id': 'b05zmgw1',
+                'ext': 'flv',
+                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+                'title': 'Royal Academy Summer Exhibition',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
             'only_matching': True,
@@ -220,26 +251,11 @@ class BBCCoUkIE(InfoExtractor):
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
             subtitles[lang] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
             ]
         return subtitles
 
@@ -250,7 +266,7 @@ class BBCCoUkIE(InfoExtractor):
                 programme_id, 'Downloading media selection XML')
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
+                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
             else:
                 raise
 
@@ -326,16 +342,27 @@ class BBCCoUkIE(InfoExtractor):
 
         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 
-        programme_id = self._search_regex(
-            r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
+        programme_id = None
+
+        tviplayer = self._search_regex(
+            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
+            webpage, 'player', default=None)
+
+        if tviplayer:
+            player = self._parse_json(tviplayer, group_id).get('player', {})
+            duration = int_or_none(player.get('duration'))
+            programme_id = player.get('vpid')
+
+        if not programme_id:
+            programme_id = self._search_regex(
+                r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
+
         if programme_id:
-            player = self._download_json(
-                'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id,
-                group_id)['jsConf']['player']
-            title = player['title']
-            description = player['subtitle']
-            duration = player['duration']
             formats, subtitles = self._download_media_selector(programme_id)
+            title = self._og_search_title(webpage)
+            description = self._search_regex(
+                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
+                webpage, 'description', fatal=False)
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
@@ -345,6 +372,7 @@ class BBCCoUkIE(InfoExtractor):
             'id': programme_id,
             'title': title,
             'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'duration': duration,
             'formats': formats,
             'subtitles': subtitles,
index d2abd4d772c95e9877a607af7cc0b6e4d56e123a..03dad4636afdf0443735fde8f1d643aea553ba10 100644 (file)
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     xpath_text,
     xpath_with_ns,
@@ -16,11 +16,11 @@ class BetIE(InfoExtractor):
         {
             'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
             'info_dict': {
-                'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+                'id': 'news/national/2014/a-conversation-with-president-obama',
                 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
                 'ext': 'flv',
-                'title': 'BET News Presents: A Conversation With President Obama',
-                'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+                'title': 'A Conversation With President Obama',
+                'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
                 'duration': 1534,
                 'timestamp': 1418075340,
                 'upload_date': '20141208',
@@ -35,7 +35,7 @@ class BetIE(InfoExtractor):
         {
             'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
             'info_dict': {
-                'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+                'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
                 'display_id': 'justice-for-ferguson-a-community-reacts',
                 'ext': 'flv',
                 'title': 'Justice for Ferguson: A Community Reacts',
@@ -57,10 +57,13 @@ class BetIE(InfoExtractor):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
 
-        media_url = compat_urllib_parse.unquote(self._search_regex(
+        media_url = compat_urllib_parse_unquote(self._search_regex(
             [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
             webpage, 'media URL'))
 
+        video_id = self._search_regex(
+            r'/video/(.*)/_jcr_content/', media_url, 'video id')
+
         mrss = self._download_xml(media_url, display_id)
 
         item = mrss.find('./channel/item')
@@ -75,8 +78,6 @@ class BetIE(InfoExtractor):
         description = xpath_text(
             item, './description', 'description', fatal=False)
 
-        video_id = xpath_text(item, './guid', 'video id', fatal=False)
-
         timestamp = parse_iso8601(xpath_text(
             item, xpath_with_ns('./dc:date', NS_MAP),
             'upload date', fatal=False))
index 77b562d99625a30035a38d14e15c6927e8b007e9..4d8cce1ef252fde0ac02dc166d3fb4fff528d1a8 100644 (file)
@@ -2,7 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    fix_xml_ampersands,
+)
 
 
 class BildIE(InfoExtractor):
@@ -15,7 +18,7 @@ class BildIE(InfoExtractor):
             'id': '38184146',
             'ext': 'mp4',
             'title': 'BILD hat sie getestet',
-            'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+            'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 196,
             'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
         }
@@ -25,7 +28,7 @@ class BildIE(InfoExtractor):
         video_id = self._match_id(url)
 
         xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
-        doc = self._download_xml(xml_url, video_id)
+        doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands)
 
         duration = int_or_none(doc.attrib.get('duration'), scale=1000)
 
index 75d744852edc382721cee8556067f89ccb0092df..ecc17ebebca9e1819fc804f37d48dcceb80c44c5 100644 (file)
@@ -2,34 +2,54 @@
 from __future__ import unicode_literals
 
 import re
+import itertools
+import json
+import xml.etree.ElementTree as ET
 
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     unified_strdate,
+    ExtractorError,
 )
 
 
 class BiliBiliIE(InfoExtractor):
     _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
         'md5': '2c301e4dab317596e837c3e7633e7d86',
         'info_dict': {
-            'id': '1074402',
+            'id': '1074402_part1',
             'ext': 'flv',
             'title': '【金坷垃】金泡沫',
             'duration': 308,
             'upload_date': '20140420',
             'thumbnail': 're:^https?://.+\.jpg',
         },
-    }
+    }, {
+        'url': 'http://www.bilibili.com/video/av1041170/',
+        'info_dict': {
+            'id': '1041170',
+            'title': '【BD1080P】刀语【诸神&异域】',
+        },
+        'playlist_count': 9,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
+        if '(此视频不存在或被删除)' in webpage:
+            raise ExtractorError(
+                'The video does not exist or was deleted', expected=True)
+
+        if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
+            raise ExtractorError(
+                'The video is not available in your region due to copyright reasons',
+                expected=True)
+
         video_code = self._search_regex(
             r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
 
@@ -54,19 +74,22 @@ class BiliBiliIE(InfoExtractor):
 
         cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
 
-        lq_doc = self._download_xml(
+        entries = []
+
+        lq_page = self._download_webpage(
             'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
             video_id,
             note='Downloading LQ video info'
         )
-        lq_durl = lq_doc.find('./durl')
-        formats = [{
-            'format_id': 'lq',
-            'quality': 1,
-            'url': lq_durl.find('./url').text,
-            'filesize': int_or_none(
-                lq_durl.find('./size'), get_attr='text'),
-        }]
+        try:
+            err_info = json.loads(lq_page)
+            raise ExtractorError(
+                'BiliBili said: ' + err_info['error_text'], expected=True)
+        except ValueError:
+            pass
+
+        lq_doc = ET.fromstring(lq_page)
+        lq_durls = lq_doc.findall('./durl')
 
         hq_doc = self._download_xml(
             'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
@@ -75,22 +98,45 @@ class BiliBiliIE(InfoExtractor):
             fatal=False,
         )
         if hq_doc is not False:
-            hq_durl = hq_doc.find('./durl')
-            formats.append({
-                'format_id': 'hq',
-                'quality': 2,
-                'ext': 'flv',
-                'url': hq_durl.find('./url').text,
+            hq_durls = hq_doc.findall('./durl')
+            assert len(lq_durls) == len(hq_durls)
+        else:
+            hq_durls = itertools.repeat(None)
+
+        i = 1
+        for lq_durl, hq_durl in zip(lq_durls, hq_durls):
+            formats = [{
+                'format_id': 'lq',
+                'quality': 1,
+                'url': lq_durl.find('./url').text,
                 'filesize': int_or_none(
-                    hq_durl.find('./size'), get_attr='text'),
+                    lq_durl.find('./size'), get_attr='text'),
+            }]
+            if hq_durl is not None:
+                formats.append({
+                    'format_id': 'hq',
+                    'quality': 2,
+                    'ext': 'flv',
+                    'url': hq_durl.find('./url').text,
+                    'filesize': int_or_none(
+                        hq_durl.find('./size'), get_attr='text'),
+                })
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': '%s_part%d' % (video_id, i),
+                'title': title,
+                'formats': formats,
+                'duration': duration,
+                'upload_date': upload_date,
+                'thumbnail': thumbnail,
             })
 
-        self._sort_formats(formats)
+            i += 1
+
         return {
+            '_type': 'multi_video',
+            'entries': entries,
             'id': video_id,
-            'title': title,
-            'formats': formats,
-            'duration': duration,
-            'upload_date': upload_date,
-            'thumbnail': thumbnail,
+            'title': title
         }
index b632ce967d515e8c819d1361afcaeddae99f8e9e..c3296283d0dfd1dd753b6e31082e48361414baa3 100644 (file)
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 
 from ..compat import (
-    compat_str,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -14,6 +13,8 @@ from ..utils import (
     int_or_none,
     parse_iso8601,
     unescapeHTML,
+    xpath_text,
+    xpath_with_ns,
 )
 
 
@@ -23,10 +24,10 @@ class BlipTVIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
-            'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+            'md5': '80baf1ec5c3d2019037c1c707d676b9f',
             'info_dict': {
                 'id': '5779306',
-                'ext': 'mov',
+                'ext': 'm4v',
                 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
                 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
                 'timestamp': 1323138843,
@@ -100,8 +101,31 @@ class BlipTVIE(InfoExtractor):
                 'vcodec': 'none',
             }
         },
+        {
+            # missing duration
+            'url': 'http://blip.tv/rss/flash/6700880',
+            'info_dict': {
+                'id': '6684191',
+                'ext': 'm4v',
+                'title': 'Cowboy Bebop: Gateway Shuffle Review',
+                'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8',
+                'timestamp': 1386639757,
+                'upload_date': '20131210',
+                'uploader': 'sfdebris',
+                'uploader_id': '706520',
+            }
+        }
     ]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
+        if mobj:
+            return 'http://blip.tv/a/a-' + mobj.group(1)
+        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         lookup_id = mobj.group('lookup_id')
@@ -119,35 +143,34 @@ class BlipTVIE(InfoExtractor):
 
         rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
 
-        def blip(s):
-            return '{http://blip.tv/dtd/blip/1.0}%s' % s
-
-        def media(s):
-            return '{http://search.yahoo.com/mrss/}%s' % s
-
-        def itunes(s):
-            return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+        def _x(p):
+            return xpath_with_ns(p, {
+                'blip': 'http://blip.tv/dtd/blip/1.0',
+                'media': 'http://search.yahoo.com/mrss/',
+                'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+            })
 
         item = rss.find('channel/item')
 
-        video_id = item.find(blip('item_id')).text
-        title = item.find('./title').text
-        description = clean_html(compat_str(item.find(blip('puredescription')).text))
-        timestamp = parse_iso8601(item.find(blip('datestamp')).text)
-        uploader = item.find(blip('user')).text
-        uploader_id = item.find(blip('userid')).text
-        duration = int(item.find(blip('runtime')).text)
-        media_thumbnail = item.find(media('thumbnail'))
-        thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
-        categories = [category.text for category in item.findall('category')]
+        video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id
+        title = xpath_text(item, 'title', 'title', fatal=True)
+        description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description'))
+        timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp'))
+        uploader = xpath_text(item, _x('blip:user'), 'uploader')
+        uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id')
+        duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration'))
+        media_thumbnail = item.find(_x('media:thumbnail'))
+        thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None
+                     else xpath_text(item, 'image', 'thumbnail'))
+        categories = [category.text for category in item.findall('category') if category is not None]
 
         formats = []
         subtitles_urls = {}
 
-        media_group = item.find(media('group'))
-        for media_content in media_group.findall(media('content')):
+        media_group = item.find(_x('media:group'))
+        for media_content in media_group.findall(_x('media:content')):
             url = media_content.get('url')
-            role = media_content.get(blip('role'))
+            role = media_content.get(_x('blip:role'))
             msg = self._download_webpage(
                 url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
                 video_id, 'Resolving URL for %s' % role)
@@ -166,8 +189,8 @@ class BlipTVIE(InfoExtractor):
                     'url': real_url,
                     'format_id': role,
                     'format_note': media_type,
-                    'vcodec': media_content.get(blip('vcodec')) or 'none',
-                    'acodec': media_content.get(blip('acodec')),
+                    'vcodec': media_content.get(_x('blip:vcodec')) or 'none',
+                    'acodec': media_content.get(_x('blip:acodec')),
                     'filesize': media_content.get('filesize'),
                     'width': int_or_none(media_content.get('width')),
                     'height': int_or_none(media_content.get('height')),
index 45ba5173246575ab617dbab911280b75d61d61e8..66e394e1093105b936191da798734128d4ea1afe 100644 (file)
@@ -16,27 +16,38 @@ class BRIE(InfoExtractor):
 
     _TESTS = [
         {
-            'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html',
-            'md5': '93556dd2bcb2948d9259f8670c516d59',
+            'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
+            'md5': '83a0477cf0b8451027eb566d88b51106',
             'info_dict': {
-                'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',
+                'id': '48f656ef-287e-486f-be86-459122db22cc',
                 'ext': 'mp4',
-                'title': 'Wenn das Traditions-Theater wackelt',
-                'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
-                'duration': 34,
-                'uploader': 'BR',
-                'upload_date': '20140802',
+                'title': 'Die böse Überraschung',
+                'description': 'Betriebliche Altersvorsorge: Die böse Überraschung',
+                'duration': 180,
+                'uploader': 'Reinhard Weber',
+                'upload_date': '20150422',
             }
         },
         {
-            'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
-            'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+            'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
+            'md5': 'a44396d73ab6a68a69a568fae10705bb',
             'info_dict': {
-                'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+                'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
+                'ext': 'mp4',
+                'title': 'Manfred Schreiber ist tot',
+                'description': 'Abendschau kompakt: Manfred Schreiber ist tot',
+                'duration': 26,
+            }
+        },
+        {
+            'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html',
+            'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
+            'info_dict': {
+                'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
                 'ext': 'aac',
-                'title': '"Keine neuen Schulden im nächsten Jahr"',
-                'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
-                'duration': 64,
+                'title': 'Kurzweilig und sehr bewegend',
+                'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend',
+                'duration': 296,
             }
         },
         {
index 0733bece7c45880ab5c20b916d5bd8c9700da548..4721c22930f15cb51d0daaac294eeeca3a329092 100644 (file)
@@ -13,6 +13,7 @@ from ..compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..utils import (
     determine_ext,
@@ -117,7 +118,10 @@ class BrightcoveIE(InfoExtractor):
         object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
         object_str = fix_xml_ampersands(object_str)
 
-        object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+        try:
+            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+        except compat_xml_parse_error:
+            return
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
         if fv_el is not None:
@@ -153,6 +157,28 @@ class BrightcoveIE(InfoExtractor):
         linkBase = find_param('linkBaseURL')
         if linkBase is not None:
             params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brighcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #   // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC.\createVideo\(
+                .*?                                                  # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                                     # in length, however it's appended to itself
+                                                                     # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
         data = compat_urllib_parse.urlencode(params)
         return cls._FEDERATED_URL_TEMPLATE % data
 
@@ -169,7 +195,7 @@ class BrightcoveIE(InfoExtractor):
         """Return a list of all Brightcove URLs from the webpage """
 
         url_m = re.search(
-            r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+            r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
             webpage)
         if url_m:
             url = unescapeHTML(url_m.group(1))
@@ -183,9 +209,14 @@ class BrightcoveIE(InfoExtractor):
             (?:
                 [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                 [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
-            ).+?</object>''',
+            ).+?>\s*</object>''',
             webpage)
-        return [cls._build_brighcove_url(m) for m in matches]
+        if matches:
+            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+        return list(filter(None, [
+            cls._build_brighcove_url_from_js(custom_bc)
+            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
index 6252be05b7f4b57787152b4edae5378675a96847..3b2de517e53da39e06912ce1a97c4aafe7fa250e 100644 (file)
@@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'md5:5438d33774b6bdc662f9485a340401cc',
             'title': 'Season 5 Episode 5',
-            'thumbnail': 're:^https?://.*promo.*'
+            'thumbnail': 're:^https?://.*\.jpg$'
         },
         'params': {
             'skip_download': True,
index 1b14471e57198c2a04833089c174c0c6c3108ab8..699b4f7d08b1928ffa1799adc755774977a84237 100644 (file)
@@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor):
     }
 
     _TESTS = [{
-        'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
-        'md5': '3db39fb48b9685438ecf33a1078023e4',
+        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
+        'md5': 'b3481d7ca972f61e37420798d0a9d934',
         'info_dict': {
-            'id': '922470',
+            'id': '1263092',
             'ext': 'flv',
-            'title': 'Zapping - 26/08/13',
-            'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
-            'upload_date': '20130826',
+            'title': 'Le Zapping - 13/05/15',
+            'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
+            'upload_date': '20150513',
         },
     }, {
         'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor):
         'skip': 'videos get deleted after a while',
     }, {
         'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
-        'md5': '65aa83ad62fe107ce29e564bb8712580',
+        'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4',
         'info_dict': {
             'id': '1213714',
             'ext': 'flv',
index 1ceb9d8d9df6c0268e33de5e34c01a245e134e05..75fffb1563ae9f95bf862ad156111b6962a8429e 100644 (file)
@@ -4,12 +4,13 @@ from .common import InfoExtractor
 
 
 class CBSIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
         'info_dict': {
             'id': '4JUVEwq3wUT7',
+            'display_id': 'connect-chat-feat-garth-brooks',
             'ext': 'flv',
             'title': 'Connect Chat feat. Garth Brooks',
             'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -24,6 +25,7 @@ class CBSIE(InfoExtractor):
         'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
         'info_dict': {
             'id': 'WWF_5KqY3PK1',
+            'display_id': 'st-vincent',
             'ext': 'flv',
             'title': 'Live on Letterman - St. Vincent',
             'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -34,12 +36,23 @@ class CBSIE(InfoExtractor):
             'skip_download': True,
         },
         '_skip': 'Blocked outside the US',
+    }, {
+        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
         real_id = self._search_regex(
-            r"video\.settings\.pid\s*=\s*'([^']+)';",
+            [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
             webpage, 'real video ID')
-        return self.url_result('theplatform:%s' % real_id)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': 'theplatform:%s' % real_id,
+            'display_id': display_id,
+        }
index 7e47960ab08f3ffba7a1e596b34c2fbfc7fd6c59..52e61d85b3a20bc939771cee2b94188d32f16d17 100644 (file)
@@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):
                 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
                 'ext': 'flv',
                 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
-                'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+                'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 205,
             },
             'params': {
index 2a5d4be185d64f47ce37ca83f9bb6bd1d7eac829..6924eac704cd5cf02d266bd60125dad9cf13e765 100644 (file)
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
-        'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
         'info_dict': {
             'id': '20131228183',
             'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
 
         matches = re.finditer(r'''(?xs)
             <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
-            <a\s+href='(?P<http_url>[^']+)'>\s*
+            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
             (?:
                 .*?
                 <a\s+href='(?P<torrent_url>[^']+\.torrent)'
index 65f6be62313dfc623cf1f9aa7adc52282872aade..dda583680a03ba3cb420beb74a99af2ec60cbc83 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
@@ -88,7 +89,7 @@ class CeskaTelevizeIE(InfoExtractor):
         if playlist_url == 'error_region':
             raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
 
-        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
+        req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url))
         req.add_header('Referer', url)
 
         playlist = self._download_json(req, video_id)
index c922f695905d70e4052ddfa5c8f336c01221413b..0206d96db4670fb29a40353839dae15911b9c6d3 100644 (file)
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
 
         base64_video_info = self._html_search_regex(
             r'var cozVidData = "(.+?)";', webpage, 'video data')
-        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
         video_info_dict = json.loads(decoded_video_info)
 
         # get video information from dict
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
new file mode 100644 (file)
index 0000000..c949a48
--- /dev/null
@@ -0,0 +1,110 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from .bliptv import BlipTVIE
+
+
+class CinemassacreIE(InfoExtractor):
+    _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+    _TESTS = [
+        {
+            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+            'md5': 'fde81fbafaee331785f58cd6c0d46190',
+            'info_dict': {
+                'id': 'Cinemassacre-19911',
+                'ext': 'mp4',
+                'upload_date': '20121110',
+                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+            },
+        },
+        {
+            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+            'md5': 'd72f10cd39eac4215048f62ab477a511',
+            'info_dict': {
+                'id': 'Cinemassacre-521be8ef82b16',
+                'ext': 'mp4',
+                'upload_date': '20131002',
+                'title': 'The Mummy’s Hand (1940)',
+            },
+        },
+        {
+            # blip.tv embedded video
+            'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
+            'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de',
+            'info_dict': {
+                'id': '4065369',
+                'ext': 'flv',
+                'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
+                'upload_date': '20061207',
+                'uploader': 'cinemassacre',
+                'uploader_id': '250778',
+                'timestamp': 1283233867,
+                'description': 'md5:0a108c78d130676b207d0f6d029ecffd',
+            }
+        },
+        {
+            # Youtube embedded video
+            'url': 'http://cinemassacre.com/2006/09/01/mckids/',
+            'md5': '6eb30961fa795fedc750eac4881ad2e1',
+            'info_dict': {
+                'id': 'FnxsNhuikpo',
+                'ext': 'mp4',
+                'upload_date': '20060901',
+                'uploader': 'Cinemassacre Extras',
+                'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
+                'uploader_id': 'Cinemassacre',
+                'title': 'AVGN: McKids',
+            }
+        },
+        {
+            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+            'md5': '1376908e49572389e7b06251a53cdd08',
+            'info_dict': {
+                'id': 'Cinemassacre-555779690c440',
+                'ext': 'mp4',
+                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+                'upload_date': '20150525',
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
+
+        webpage = self._download_webpage(url, display_id)
+
+        playerdata_url = self._search_regex(
+            [
+                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+                r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
+            ],
+            webpage, 'player data URL', default=None)
+        if not playerdata_url:
+            playerdata_url = BlipTVIE._extract_url(webpage)
+        if not playerdata_url:
+            raise ExtractorError('Unable to find player data')
+
+        video_title = self._html_search_regex(
+            r'<title>(?P<title>.+?)\|', webpage, 'title')
+        video_description = self._html_search_regex(
+            r'<div class="entry-content">(?P<description>.+?)</div>',
+            webpage, 'description', flags=re.DOTALL, fatal=False)
+        video_thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'title': video_title,
+            'description': video_description,
+            'upload_date': video_date,
+            'thumbnail': video_thumbnail,
+            'url': playerdata_url,
+        }
index d07d544eaf7742bb782a8b367a09561f14c44a4e..8306d6fb7d0d4414cff36f7b381ca9c877820f58 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
@@ -10,9 +8,9 @@ from ..utils import (
 
 
 class ClipsyndicateIE(InfoExtractor):
-    _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+    _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
         'md5': '4d7d549451bad625e0ff3d7bd56d776c',
         'info_dict': {
@@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor):
             'duration': 612,
             'thumbnail': 're:^https?://.+\.jpg',
         },
-    }
+    }, {
+        'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
         js_player = self._download_webpage(
             'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
             video_id, 'Downlaoding player')
index 3145b30514ea2a075f92077b9f87b64c9e8820a7..5dd69bff7ac73bcc0adc4d91c614045ddf116a9c 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 class CNETIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
         'info_dict': {
             'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
         'params': {
             'skip_download': 'requires rtmpdump',
         }
-    }
+    }, {
+        'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+        'info_dict': {
+            'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+            'ext': 'flv',
+            'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+            'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+            'uploader': 'Ashley Esqueda',
+            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
             raise ExtractorError('Cannot find video data')
 
         mpx_account = data['config']['players']['default']['mpx_account']
-        vid = vdata['files']['rtmp']
+        vid = vdata['files'].get('rtmp', vdata['files']['hds'])
         tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
 
         video_id = vdata['id']
index 5efc5f4fe556a4424542b441a83f2d6dbd5bc8e7..3b1bd4033fd1c01986c83ab44cc1cebaa1b19e5b 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
index e5edcc84b69ef7bdffdbb7ed158c901c560a7575..91ebb0ce57136dc0076927acdca4e250774746e1 100644 (file)
@@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
 
         uri = mMovieParams[0][1]
         # Correct cc.com in uri
-        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri)
 
         index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
         idoc = self._download_xml(
index 8ed97f8dddfc5d9b51dab2630008c36e054a50bf..b9014fc23e53eaf335d65ee56c3db560d218d642 100644 (file)
@@ -22,17 +22,20 @@ from ..compat import (
     compat_str,
 )
 from ..utils import (
+    NO_DEFAULT,
     age_restricted,
+    bug_reports_message,
     clean_html,
     compiled_regex_type,
+    determine_ext,
     ExtractorError,
+    fix_xml_ampersands,
     float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
 )
-_NO_DEFAULT = object()
 
 
 class InfoExtractor(object):
@@ -46,7 +49,7 @@ class InfoExtractor(object):
     information possibly downloading the video to the file system, among
     other possible outcomes.
 
-    The type field determines the the type of the result.
+    The type field determines the type of the result.
     By far the most common value (and the default if _type is missing) is
     "video", which indicates a single video.
 
@@ -110,11 +113,8 @@ class InfoExtractor(object):
                                   (quality takes higher priority)
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
-                    * http_method  HTTP method to use for the download.
                     * http_headers  A dictionary of additional HTTP headers
                                  to add to the request.
-                    * http_post_data  Additional data to send with a POST
-                                 request.
                     * stretched_ratio  If given and not 1, indicates that the
                                  video's pixels are not square.
                                  width : height ratio as float.
@@ -324,7 +324,7 @@ class InfoExtractor(object):
                 self._downloader.report_warning(errmsg)
                 return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
@@ -334,14 +334,11 @@ class InfoExtractor(object):
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
         return (content, urlh)
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
@@ -354,6 +351,16 @@ class InfoExtractor(object):
                 encoding = 'utf-16'
             else:
                 encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
@@ -410,13 +417,13 @@ class InfoExtractor(object):
 
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -431,10 +438,10 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
         if xml_string is False:
             return xml_string
         if transform_source:
@@ -445,9 +452,10 @@ class InfoExtractor(object):
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
         json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -517,7 +525,7 @@ class InfoExtractor(object):
             video_info['description'] = playlist_description
         return video_info
 
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Perform a regex search on the given string, using a single or a list of
         patterns returning the first matching group.
@@ -543,16 +551,15 @@ class InfoExtractor(object):
                 return next(g for g in mobj.groups() if g is not None)
             else:
                 return mobj.group(group)
-        elif default is not _NO_DEFAULT:
+        elif default is not NO_DEFAULT:
             return default
         elif fatal:
             raise RegexNotFoundError('Unable to extract %s' % _name)
         else:
-            self._downloader.report_warning('unable to extract %s; '
-                                            'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
@@ -564,7 +571,7 @@ class InfoExtractor(object):
 
     def _get_login_info(self):
         """
-        Get the the login info as (username, password)
+        Get the login info as (username, password)
         It will look in the netrc file using the _NETRC_MACHINE value
         If there's no info available, return (None, None)
         """
@@ -700,7 +707,26 @@ class InfoExtractor(object):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
-    def _sort_formats(self, formats):
+    @staticmethod
+    def _hidden_inputs(html):
+        return dict([
+            (input.group('name'), input.group('value')) for input in re.finditer(
+                r'''(?x)
+                    <input\s+
+                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
+                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
+                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
+                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
+                ''', html)
+        ])
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
+
+    def _sort_formats(self, formats, field_preference=None):
         if not formats:
             raise ExtractorError('No video formats found')
 
@@ -710,6 +736,9 @@ class InfoExtractor(object):
             if not f.get('ext') and 'url' in f:
                 f['ext'] = determine_ext(f['url'])
 
+            if isinstance(field_preference, (list, tuple)):
+                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
             preference = f.get('preference')
             if preference is None:
                 proto = f.get('protocol')
@@ -756,7 +785,7 @@ class InfoExtractor(object):
                 f.get('fps') if f.get('fps') is not None else -1,
                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                 f.get('source_preference') if f.get('source_preference') is not None else -1,
-                f.get('format_id'),
+                f.get('format_id') if f.get('format_id') is not None else '',
             )
         formats.sort(key=_formats_key)
 
@@ -778,8 +807,8 @@ class InfoExtractor(object):
             return True
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError):
-                self.report_warning(
-                    '%s URL is invalid, skipping' % item, video_id)
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
             raise
 
@@ -807,10 +836,14 @@ class InfoExtractor(object):
         self.to_screen(msg)
         time.sleep(timeout)
 
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest')
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source)
 
         formats = []
         manifest_version = '1.0'
@@ -820,8 +853,19 @@ class InfoExtractor(object):
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
-                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
-                                (media_el.attrib.get('href') or media_el.attrib.get('url')))
+                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                # If media_url is itself a f4m manifest do the recursive extraction
+                # since bitrates in parent manifest (this one) and media_url manifest
+                # may differ leading to inability to resolve the format by requested
+                # bitrate in f4m downloader
+                if determine_ext(manifest_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
@@ -838,7 +882,8 @@ class InfoExtractor(object):
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True):
 
         formats = [{
             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -857,8 +902,11 @@ class InfoExtractor(object):
 
         m3u8_doc = self._download_webpage(
             m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal)
+        if m3u8_doc is False:
+            return m3u8_doc
         last_info = None
         last_media = None
         kv_rex = re.compile(
@@ -888,7 +936,7 @@ class InfoExtractor(object):
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
-                last_media_name = last_media.get('NAME') if last_media else None
+                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                 f = {
                     'format_id': '-'.join(format_id),
@@ -948,7 +996,7 @@ class InfoExtractor(object):
     def _parse_smil_video(self, video, video_id, base, rtmp_count):
         src = video.get('src')
         if not src:
-            return ([], rtmp_count)
+            return [], rtmp_count
         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
         width = int_or_none(video.get('width'))
         height = int_or_none(video.get('height'))
@@ -961,7 +1009,7 @@ class InfoExtractor(object):
                     proto = 'http'
         ext = video.get('ext')
         if proto == 'm3u8':
-            return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
+            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
         elif proto == 'rtmp':
             rtmp_count += 1
             streamer = video.get('streamer') or base
@@ -1064,9 +1112,6 @@ class InfoExtractor(object):
     def _get_automatic_captions(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
-    def _subtitles_timecode(self, seconds):
-        return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
-
 
 class SearchInfoExtractor(InfoExtractor):
     """
index cf763ee7e03019adc5f957060b0f45e52e532084..94d03ce2af108a4a711f09f3db9fecd5bd62566e 100644 (file)
@@ -11,39 +11,65 @@ from ..utils import (
 
 class CrackedIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
-    _TEST = {
+    _TESTS = [{
+        'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
+        'md5': '89b90b9824e3806ca95072c4d78f13f7',
+        'info_dict': {
+            'id': '19070',
+            'ext': 'mp4',
+            'title': 'If Animal Actors Got E! True Hollywood Stories',
+            'timestamp': 1404954000,
+            'upload_date': '20140710',
+        }
+    }, {
+        # youtube embed
         'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
-        'md5': '4b29a5eeec292cd5eca6388c7558db9e',
+        'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
         'info_dict': {
-            'id': '19006',
+            'id': 'EjI00A3rZD0',
             'ext': 'mp4',
-            'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies',
-            'description': 'md5:3b909e752661db86007d10e5ec2df769',
-            'timestamp': 1405659600,
-            'upload_date': '20140718',
+            'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
+            'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
+            'upload_date': '20140725',
+            'uploader_id': 'Cracked',
+            'uploader': 'Cracked',
         }
-    }
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
+        youtube_url = self._search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
+            webpage, 'youtube url', default=None)
+        if youtube_url:
+            return self.url_result(youtube_url, 'Youtube')
+
         video_url = self._html_search_regex(
-            [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], webpage, 'video URL')
+            [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
+            webpage, 'video URL')
+
+        title = self._search_regex(
+            [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
+            webpage, 'title')
 
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
+        description = self._search_regex(
+            r'name="?(?:og:)?description"?\s+content="([^"]+)"',
+            webpage, 'description', default=None)
 
-        timestamp = self._html_search_regex(r'<time datetime="([^"]+)"', webpage, 'upload date', fatal=False)
+        timestamp = self._html_search_regex(
+            r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
         if timestamp:
             timestamp = parse_iso8601(timestamp[:-6])
 
         view_count = str_to_int(self._html_search_regex(
-            r'<span class="views" id="viewCounts">([\d,\.]+) Views</span>', webpage, 'view count', fatal=False))
+            r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
+            webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'<span id="commentCounts">([\d,\.]+)</span>', webpage, 'comment count', fatal=False))
+            r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
+            webpage, 'comment count', fatal=False))
 
         m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
         if m:
index 6ded723c96ddad70ebf95b9fb6b73c811bcc6746..d1b6d7366e847015af4581160c918d4a1ee6e11f 100644 (file)
@@ -12,6 +12,7 @@ from math import pow, sqrt, floor
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -27,7 +28,7 @@ from ..aes import (
 
 
 class CrunchyrollIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _NETRC_MACHINE = 'crunchyroll'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -45,6 +46,22 @@ class CrunchyrollIE(InfoExtractor):
             # rtmp
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+        'info_dict': {
+            'id': '589804',
+            'ext': 'flv',
+            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+            'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Danny Choo Network',
+            'upload_date': '20120213',
+        },
+        'params': {
+            # rtmp
+            'skip_download': True,
+        },
+
     }, {
         'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
         'only_matching': True,
@@ -76,8 +93,8 @@ class CrunchyrollIE(InfoExtractor):
         self._login()
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(data)
-        iv = bytes_to_intlist(iv)
+        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +196,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
         return output
 
+    def _extract_subtitles(self, subtitle):
+        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +217,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
             if not id or not iv or not data:
                 continue
-            id = int(id)
-            iv = base64.b64decode(iv)
-            data = base64.b64decode(data)
-
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
-            sub_root = xml.etree.ElementTree.fromstring(subtitle)
-            subtitles[lang_code] = [
-                {
-                    'ext': 'srt',
-                    'data': self._convert_subtitles_to_srt(sub_root),
-                },
-                {
-                    'ext': 'ass',
-                    'data': self._convert_subtitles_to_ass(sub_root),
-                },
-            ]
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
         return subtitles
 
     def _real_extract(self, url):
@@ -242,7 +255,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             video_upload_date = unified_strdate(video_upload_date)
         video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
 
-        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
+        playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
         playerdata_req = compat_urllib_request.Request(playerdata_url)
         playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
         playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -255,16 +268,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
             stream_quality, stream_format = self._FORMAT_IDS[fmt]
             video_format = fmt + 'p'
-            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
-            # urlencode doesn't work!
-            streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+            streamdata_req = compat_urllib_request.Request(
+                'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+                % (stream_id, stream_format, stream_quality),
+                compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
             streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
             streamdata = self._download_xml(
                 streamdata_req, video_id,
                 note='Downloading media info for %s' % video_format)
-            video_url = streamdata.find('.//host').text
-            video_play_path = streamdata.find('.//file').text
+            stream_info = streamdata.find('./{default}preload/stream_info')
+            video_url = stream_info.find('./host').text
+            video_play_path = stream_info.find('./file').text
             formats.append({
                 'url': video_url,
                 'play_path': video_play_path,
index 955119d40be3797e073b030790b3685b7ca4be15..fbefd37d09a98bb19c82b4c09b7b08c99d147d35 100644 (file)
@@ -7,7 +7,10 @@ from ..utils import (
     int_or_none,
     unescapeHTML,
     find_xpath_attr,
+    smuggle_url,
+    determine_ext,
 )
+from .senateisvp import SenateISVPIE
 
 
 class CSpanIE(InfoExtractor):
@@ -35,11 +38,22 @@ class CSpanIE(InfoExtractor):
         }
     }, {
         'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
+        'md5': '446562a736c6bf97118e389433ed88d4',
         'info_dict': {
             'id': '342759',
+            'ext': 'mp4',
             'title': 'General Motors Ignition Switch Recall',
+            'duration': 14848,
+            'description': 'md5:70c7c3b8fa63fa60d42772440596034c'
         },
-        'playlist_duration_sum': 14855,
+    }, {
+        # Video from senate.gov
+        'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'flv',
+            'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
+        }
     }]
 
     def _real_extract(self, url):
@@ -56,7 +70,7 @@ class CSpanIE(InfoExtractor):
                 # present, otherwise this is a stripped version
                 r'<p class=\'initial\'>(.*?)</p>'
             ],
-            webpage, 'description', flags=re.DOTALL)
+            webpage, 'description', flags=re.DOTALL, default=None)
 
         info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
         data = self._download_json(info_url, video_id)
@@ -68,7 +82,16 @@ class CSpanIE(InfoExtractor):
         title = find_xpath_attr(doc, './/string', 'name', 'title').text
         thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
 
+        senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+        if senate_isvp_url:
+            surl = smuggle_url(senate_isvp_url, {'force_title': title})
+            return self.url_result(surl, 'SenateISVP', video_id, title)
+
         files = data['video']['files']
+        try:
+            capfile = data['video']['capfile']['#text']
+        except KeyError:
+            capfile = None
 
         entries = [{
             'id': '%s_%d' % (video_id, partnum + 1),
@@ -79,11 +102,22 @@ class CSpanIE(InfoExtractor):
             'description': description,
             'thumbnail': thumbnail,
             'duration': int_or_none(f.get('length', {}).get('#text')),
+            'subtitles': {
+                'en': [{
+                    'url': capfile,
+                    'ext': determine_ext(capfile, 'dfxp')
+                }],
+            } if capfile else None,
         } for partnum, f in enumerate(files)]
 
-        return {
-            '_type': 'playlist',
-            'entries': entries,
-            'title': title,
-            'id': video_id,
-        }
+        if len(entries) == 1:
+            entry = dict(entries[0])
+            entry['id'] = video_id
+            return entry
+        else:
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+                'title': title,
+                'id': video_id,
+            }
index 0226f8036c81d97246c87f2308614d84202ae540..45049bf371370da6e4b64952441e76d86814fd6a 100644 (file)
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError
 
 
 class CtsNewsIE(InfoExtractor):
+    IE_DESC = '華視新聞'
     # https connection failed (Connection reset)
     _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
     _TESTS = [{
index 7615ecd4ba3a9d55720f697cb792ca01e92bd1df..1a41c0db181c43c9c35d24988b9fd118c248f399 100644 (file)
@@ -52,6 +52,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                 'ext': 'mp4',
                 'uploader': 'IGN',
                 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+                'upload_date': '20150306',
+                'duration': 74,
             }
         },
         # Vevo video
@@ -85,7 +87,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        url = 'http://www.dailymotion.com/video/%s' % video_id
+        url = 'https://www.dailymotion.com/video/%s' % video_id
 
         # Retrieve video webpage to extract further information
         request = self._build_request(url)
@@ -106,11 +108,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         age_limit = self._rta_search(webpage)
 
         video_upload_date = None
-        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
+        mobj = re.search(r'<meta property="video:release_date" content="([0-9]{4})-([0-9]{2})-([0-9]{2}).+?"/>', webpage)
         if mobj is not None:
-            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
+            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
 
-        embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
+        embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id
         embed_request = self._build_request(embed_url)
         embed_page = self._download_webpage(
             embed_request, video_id, 'Downloading embed page')
@@ -163,6 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'thumbnail': info['thumbnail_url'],
             'age_limit': age_limit,
             'view_count': view_count,
+            'duration': info['duration']
         }
 
     def _get_subtitles(self, video_id, webpage):
@@ -224,7 +227,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
 class DailymotionUserIE(DailymotionPlaylistIE):
     IE_NAME = 'dailymotion:user'
-    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$'
     _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
     _TESTS = [{
         'url': 'https://www.dailymotion.com/user/nqtv',
@@ -238,7 +241,8 @@ class DailymotionUserIE(DailymotionPlaylistIE):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
-        webpage = self._download_webpage(url, user)
+        webpage = self._download_webpage(
+            'https://www.dailymotion.com/user/%s' % user, user)
         full_user = unescapeHTML(self._html_search_regex(
             r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
             webpage, 'user'))
@@ -249,3 +253,53 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': full_user,
             'entries': self._extract_entries(user),
         }
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+    _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
+    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
+    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX
+
+    _TESTS = [{
+        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+        # Tested at FranceTvInfo_2
+        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+        'only_matching': True,
+    }, {
+        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
+        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_dmcloud_url(self, webpage):
+        mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
+        if mobj:
+            return mobj.group(1)
+
+        mobj = re.search(
+            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+            webpage)
+        if mobj:
+            return mobj.group(1)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        request = self._build_request(url)
+        webpage = self._download_webpage(request, video_id)
+
+        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+        video_info = self._parse_json(self._search_regex(
+            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+        # TODO: parse ios_url, which is in fact a manifest
+        video_url = video_info['mp4_url']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': video_info.get('thumbnail_url'),
+        }
index 8049779b0a31049f704bae256a3752a9a22ad789..263532cc6e66a94c79670caa5e1600444ce909da 100644 (file)
@@ -3,42 +3,47 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import unified_strdate
 
 
 class DFBIE(InfoExtractor):
     IE_NAME = 'tv.dfb.de'
-    _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/',
+        'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
         # The md5 is different each time
         'info_dict': {
-            'id': '9070',
+            'id': '11633',
+            'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
             'ext': 'flv',
-            'title': 'Highlights des Empfangs in Berlin',
-            'upload_date': '20140716',
+            'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
+            'upload_date': '20150714',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)
         player_info = self._download_xml(
             'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
-            video_id)
+            display_id)
         video_info = player_info.find('video')
 
-        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
+        f4m_info = self._download_xml(
+            self._proto_relative_url(video_info.find('url').text.strip()), display_id)
         token_el = f4m_info.find('token')
         manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
+        formats = self._extract_f4m_formats(manifest_url, display_id)
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'title': video_info.find('title').text,
-            'url': manifest_url,
-            'ext': 'flv',
             'thumbnail': self._og_search_thumbnail(webpage),
-            'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]),
+            'upload_date': unified_strdate(video_info.find('time_date').text),
+            'formats': formats,
         }
index d3e6675283cddcb8f6a6dfffbfbd1e1ea3da11bc..d6723ecf26ea67356b288df6e5f3bf612141b91a 100644 (file)
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    parse_duration,
     parse_iso8601,
-    int_or_none,
 )
+from ..compat import compat_str
 
 
 class DiscoveryIE(InfoExtractor):
     _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
-        'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
         'info_dict': {
-            'id': 'mission-impossible-outtakes',
-            'ext': 'flv',
+            'id': '20769',
+            'ext': 'mp4',
             'title': 'Mission Impossible Outtakes',
             'description': ('Watch Jamie Hyneman and Adam Savage practice being'
                             ' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
             'timestamp': 1303099200,
             'upload_date': '20110418',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
+    }, {
+        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+        'info_dict': {
+            'id': 'mythbusters-the-simpsons',
+            'title': 'MythBusters: The Simpsons',
+        },
+        'playlist_count': 9,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        info = self._download_json(url + '?flat=1', video_id)
 
-        info = self._parse_json(self._search_regex(
-            r'(?s)<script type="application/ld\+json">(.*?)</script>',
-            webpage, 'video info'), video_id)
+        video_title = info.get('playlist_title') or info.get('video_title')
 
-        return {
-            'id': video_id,
-            'title': info['name'],
-            'url': info['contentURL'],
-            'description': info.get('description'),
-            'thumbnail': info.get('thumbnailUrl'),
-            'timestamp': parse_iso8601(info.get('uploadDate')),
-            'duration': int_or_none(info.get('duration')),
-        }
+        entries = [{
+            'id': compat_str(video_info['id']),
+            'formats': self._extract_m3u8_formats(
+                video_info['src'], video_id, ext='mp4',
+                note='Download m3u8 information for video %d' % (idx + 1)),
+            'title': video_info['title'],
+            'description': video_info.get('description'),
+            'duration': parse_duration(video_info.get('video_length')),
+            'webpage_url': video_info.get('href'),
+            'thumbnail': video_info.get('thumbnailURL'),
+            'alt_title': video_info.get('secondary_title'),
+            'timestamp': parse_iso8601(video_info.get('publishedDate')),
+        } for idx, video_info in enumerate(info['playlist'])]
+
+        return self.playlist_result(entries, video_id, video_title)
index f51d88a986b79d65cae3c1604ee3d16e9515c0fd..e9ca236d4a03c13b1b29b3386535c4262332dab0 100644 (file)
@@ -36,7 +36,8 @@ class DotsubIE(InfoExtractor):
         if not video_url:
             webpage = self._download_webpage(url, video_id)
             video_url = self._search_regex(
-                r'"file"\s*:\s*\'([^\']+)', webpage, 'video url')
+                [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'],
+                webpage, 'video url')
 
         return {
             'id': video_id,
index 479430c51072ab91e976df4d459af372c5608cdd..373b3b4b4735d8544128c48a10037eed3c570e5d 100644 (file)
@@ -9,6 +9,7 @@ from ..compat import (compat_str, compat_basestring)
 
 
 class DouyuTVIE(InfoExtractor):
+    IE_DESC = '斗鱼'
     _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
     _TESTS = [{
         'url': 'http://www.douyutv.com/iseven',
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644 (file)
index 0000000..38e6597
--- /dev/null
@@ -0,0 +1,216 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+    """Common base for the DramaFever extractors: optional login plus consumer secret lookup."""
+
+    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+    _NETRC_MACHINE = 'dramafever'
+
+    # Fallback value used when main.js cannot be downloaded or does not
+    # contain a ``var cs = '...'`` assignment.
+    _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+
+    # Filled in by _real_initialize().
+    _consumer_secret = None
+
+    def _get_consumer_secret(self):
+        """Return the ``cs`` value from the site's main.js, or the hard-coded fallback."""
+        mainjs = self._download_webpage(
+            'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+            None, 'Downloading main.js', fatal=False)
+        if mainjs:
+            return self._search_regex(
+                r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+                'consumer secret', default=self._CONSUMER_SECRET)
+        return self._CONSUMER_SECRET
+
+    def _real_initialize(self):
+        """Log in (when credentials are available) and resolve the consumer secret."""
+        self._login()
+        self._consumer_secret = self._get_consumer_secret()
+
+    def _login(self):
+        """POST the credentials to the login form; raise ExtractorError if login fails.
+
+        Does nothing when no username is configured.
+        """
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        post_data = compat_urllib_parse.urlencode({
+            'username': username,
+            'password': password,
+        }).encode('utf-8')
+        response = self._download_webpage(
+            compat_urllib_request.Request(self._LOGIN_URL, post_data),
+            None, 'Logging in as %s' % username)
+
+        # A logout link in the response is taken as proof of a successful login.
+        logout_markers = ('href="/accounts/logout/"', '>Log out<')
+        if any(marker in response for marker in logout_markers):
+            return
+
+        error = self._html_search_regex(
+            r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+            response, 'error message', default=None)
+        if error:
+            raise ExtractorError('Unable to login: %s' % error, expected=True)
+        raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+    """Extractor for a single DramaFever episode (``/drama/<series>/<episode>/``)."""
+
+    IE_NAME = 'dramafever'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+    _TEST = {
+        'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin 4512.1',
+            'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1404336058,
+            'upload_date': '20140702',
+            'duration': 343,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict for one episode.
+
+        Formats, title, description, thumbnail, duration and TTML subtitles
+        come from the episode feed JSON; an additional SRT subtitle is taken
+        from the episode API when it reports one.
+
+        Raises ExtractorError (expected) when the feed request fails with an
+        HTTP error, which is reported as a geo restriction.
+        """
+        # The URL id is '<series>/<episode>'; the feed guid uses a dot instead.
+        video_id = self._match_id(url).replace('/', '.')
+
+        try:
+            feed = self._download_json(
+                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+                video_id, 'Downloading episode JSON')['channel']['item']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                raise ExtractorError(
+                    'Currently unavailable in your country.', expected=True)
+            raise
+
+        media_group = feed.get('media-group', {})
+        # 'media-group' is optional above, so 'media-content' must be read
+        # defensively as well instead of failing with a bare KeyError.
+        media_contents = media_group.get('media-content') or []
+
+        formats = []
+        for media_content in media_contents:
+            src = media_content.get('@attributes', {}).get('url')
+            if not src:
+                continue
+            ext = determine_ext(src)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    src, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = media_group.get('media-title')
+        description = media_group.get('media-description')
+        # The duration is carried on the first media-content entry.
+        duration = None
+        if media_contents:
+            duration = int_or_none(
+                media_contents[0].get('@attributes', {}).get('duration'))
+        thumbnail = self._proto_relative_url(
+            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+        subtitles = {}
+        for media_subtitle in media_group.get('media-subTitle', []):
+            lang = media_subtitle.get('@attributes', {}).get('lang')
+            href = media_subtitle.get('@attributes', {}).get('href')
+            if not lang or not href:
+                continue
+            subtitles[lang] = [{
+                'ext': 'ttml',
+                'url': href,
+            }]
+
+        series_id, episode_number = video_id.split('.')
+        episode_info = self._download_json(
+            # We only need a single episode info, so restricting page size to one episode
+            # and dealing with page number as with episode number
+            r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1'
+            % (self._consumer_secret, series_id, episode_number),
+            video_id, 'Downloading episode info JSON', fatal=False)
+        if episode_info:
+            value = episode_info.get('value')
+            if value:
+                subfile = value[0].get('subfile') or value[0].get('new_subfile')
+                # A bare '/st/' URL is skipped: it carries no file name.
+                if subfile and subfile != 'http://www.dramafever.com/st/':
+                    subtitles.setdefault('English', []).append({
+                        'ext': 'srt',
+                        'url': subfile,
+                    })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+    """Playlist extractor for a whole DramaFever series (``/drama/<series>/...``)."""
+
+    IE_NAME = 'dramafever:series'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+    _TESTS = [{
+        'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512',
+            'title': 'Cooking with Shin',
+            'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://www.dramafever.com/drama/124/IRIS/',
+        'info_dict': {
+            'id': '124',
+            'title': 'IRIS',
+            'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+        },
+        'playlist_count': 20,
+    }]
+
+    _PAGE_SIZE = 60  # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+    def _real_extract(self, url):
+        """Return a playlist with one ``url_result`` entry per episode of the series.
+
+        Series metadata comes from the series query API; episodes are paged
+        through the episode API, ``_PAGE_SIZE`` at a time, until the page
+        count reported by the API is reached.
+        """
+        series_id = self._match_id(url)
+
+        series = self._download_json(
+            'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+            % (self._consumer_secret, series_id),
+            series_id, 'Downloading series JSON')['series'][series_id]
+
+        title = clean_html(series['name'])
+        description = clean_html(series.get('description') or series.get('description_short'))
+
+        entries = []
+        for page_num in itertools.count(1):
+            episodes = self._download_json(
+                'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+                % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num),
+                series_id, 'Downloading episodes JSON page #%d' % page_num)
+            for episode in episodes.get('value', []):
+                episode_url = episode.get('episode_url')
+                if not episode_url:
+                    continue
+                entries.append(self.url_result(
+                    compat_urlparse.urljoin(url, episode_url),
+                    'DramaFever', episode.get('guid')))
+            # '>=' rather than '==': itertools.count() never ends by itself, so
+            # a reported page count of 0 (or one smaller than the current
+            # page) must still terminate the loop.
+            if page_num >= episodes['num_pages']:
+                break
+
+        return self.playlist_result(entries, series_id, title, description)
index 7626219baf33522958960bca0d8babe67b8a332e..8b98b013adeee32c67c769acfb88d76edff9a1f7 100644 (file)
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
-        'md5': 'fe330252ddea607635cf2eb2c99a0af3',
         'info_dict': {
             'id': '65517',
             'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
             'upload_date': '20110120',
             'duration': 3664,
         },
+        'params': {
+            'skip_download': True,  # requires rtmp
+        },
     }, {
         'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
         'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
                         'format_id': file['Type'].replace('Video', ''),
                         'preference': preferencemap.get(file['Type'], -10),
                     })
+                    if format['url'].startswith('rtmp'):
+                        rtmp_url = format['url']
+                        format['rtmp_live'] = True  # --resume does not work
+                        if '/bonanza/' in rtmp_url:
+                            format['play_path'] = rtmp_url.split('/bonanza/')[1]
                     formats.append(format)
                 elif file['Type'] == "Thumb":
                     thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
         description = '%s\n%s\n%s\n' % (
             info['Description'], info['Actors'], info['Colophon'])
 
-        for f in formats:
-            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
-            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
         self._sort_formats(formats)
 
         display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
index 05bb22ddf3f0a3b095e1108b88d47f6766484d09..8ac8587be6af564af3674c8ff7e7754364bc311e 100644 (file)
@@ -11,19 +11,25 @@ from ..utils import (
 
 class DreiSatIE(InfoExtractor):
     IE_NAME = '3sat'
-    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
-    _TEST = {
-        'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
-        'md5': 'be37228896d30a88f315b638900a026e',
-        'info_dict': {
-            'id': '45918',
-            'ext': 'mp4',
-            'title': 'Waidmannsheil',
-            'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
-            'uploader': '3sat',
-            'upload_date': '20140913'
-        }
-    }
+    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+    _TESTS = [
+        {
+            'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
+            'md5': 'be37228896d30a88f315b638900a026e',
+            'info_dict': {
+                'id': '45918',
+                'ext': 'mp4',
+                'title': 'Waidmannsheil',
+                'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
+                'uploader': '3sat',
+                'upload_date': '20140913'
+            }
+        },
+        {
+            'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
+            'only_matching': True,
+        },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 37c5c181f799efd8ee69d850c0b6076130c64073..639f9182c5484a22f0056e25fc6aa7e56f193df5 100644 (file)
@@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor):
             r'<source src="([^"]+)"', webpage, 'video URL')
 
         title = self._html_search_regex(
-            [r'class="hd_title" style="[^"]+">([^<]+)</h1>', r'<title>([^<]+) - \d+'],
+            [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
             webpage, 'title')
 
         thumbnail = self._html_search_regex(
             r'poster="([^"]+)"',
             webpage, 'thumbnail', fatal=False)
 
-        like_count = str_to_int(self._html_search_regex(
-            r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
-            webpage, 'like count', fatal=False))
-        dislike_count = str_to_int(self._html_search_regex(
-            r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
-            webpage, 'like count', fatal=False))
-        comment_count = str_to_int(self._html_search_regex(
-            r'<span class="comments_count">([\d,\.]+)</span>',
-            webpage, 'comment count', fatal=False))
+        def extract_count(id_, name):
+            return str_to_int(self._html_search_regex(
+                r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+                webpage, '%s count' % name, fatal=False))
+
+        like_count = extract_count('rate_likes', 'like')
+        dislike_count = extract_count('rate_dislikes', 'dislike')
+        comment_count = extract_count('comments_count', 'comment')
 
         cats_str = self._search_regex(
-            r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False)
+            r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
         categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
 
         return {
index f25ab319e66d4d5b151cd9a9d4509807b6a88617..baa24c6d13abe016cceb83bb927db15d7d300509 100644 (file)
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
 
 
 class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
                 restricted_to_denmark = asset['RestrictedToDenmark']
                 spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
                 for link in asset['Links']:
-                    target = link['Target']
                     uri = link['Uri']
+                    target = link['Target']
                     format_id = target
-                    preference = -1 if target == 'HDS' else -2
+                    preference = None
                     if spoken_subtitles:
-                        preference -= 2
+                        preference = -1
                         format_id += '-spoken-subtitles'
-                    formats.append({
-                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
-                        'format_id': format_id,
-                        'ext': link['FileFormat'],
-                        'preference': preference,
-                    })
+                    if target == 'HDS':
+                        formats.extend(self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id))
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', preference=preference,
+                            m3u8_id=format_id))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': bitrate,
+                            'ext': link.get('FileFormat'),
+                        })
                 subtitles_list = asset.get('SubtitlesList')
                 if isinstance(subtitles_list, list):
                     LANGS = {
index 9c594b757e4bbe41371cc603441c4dd5230e7b74..999fb5620df2976073122fb95fbad1bb133f357a 100644 (file)
@@ -26,7 +26,7 @@ class DumpertIE(InfoExtractor):
         video_id = self._match_id(url)
 
         req = compat_urllib_request.Request(url)
-        req.add_header('Cookie', 'nsfw=1')
+        req.add_header('Cookie', 'nsfw=1; cpc=10')
         webpage = self._download_webpage(req, video_id)
 
         files_base64 = self._search_regex(
index 9cb1bf301b9ae3e327e4831bdb8a7d2437b43803..b1cd4f5d4e6fe1dbaaf6ec230aad75947887301d 100644 (file)
@@ -1,9 +1,7 @@
 from __future__ import unicode_literals
 
-from ..compat import (
-    compat_urllib_parse,
-)
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 
 
 class EHowIE(InfoExtractor):
@@ -26,7 +24,7 @@ class EHowIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
             r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
-        final_url = compat_urllib_parse.unquote(video_url)
+        final_url = compat_urllib_parse_unquote(video_url)
         uploader = self._html_search_meta('uploader', webpage)
         title = self._og_search_title(webpage).replace(' | eHow', '')
 
index 5154bbd7f8e5a8447a24d8274780648a7eae0ca4..02c6a4615c4436fecda86fb152a131f084640612 100644 (file)
@@ -6,57 +6,42 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    parse_iso8601,
 )
 
 
 class EllenTVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
-    _TESTS = [{
+    _TEST = {
         'url': 'http://www.ellentv.com/videos/0-ipq1gsai/',
         'md5': '8e3c576bf2e9bfff4d76565f56f94c9c',
         'info_dict': {
-            'id': '0-ipq1gsai',
+            'id': '0_ipq1gsai',
             'ext': 'mp4',
             'title': 'Fast Fingers of Fate',
-            'description': 'md5:686114ced0a032926935e9015ee794ac',
-            'timestamp': 1428033600,
+            'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6',
+            'timestamp': 1428035648,
             'upload_date': '20150403',
+            'uploader_id': 'batchUser',
         }
-    }, {
-        'url': 'http://ellentube.com/videos/0-dvzmabd5/',
-        'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb',
-        'info_dict': {
-            'id': '0-dvzmabd5',
-            'ext': 'mp4',
-            'title': '1 year old twin sister makes her brother laugh',
-            'description': '1 year old twin sister makes her brother laugh',
-            'timestamp': 1419542075,
-            'upload_date': '20141225',
-        }
-    }]
+    }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://widgets.ellentube.com/videos/%s' % video_id,
+            video_id)
 
-        video_url = self._html_search_meta('VideoURL', webpage, 'url', fatal=True)
-        title = self._og_search_title(webpage, default=None) or self._search_regex(
-            r'pageName\s*=\s*"([^"]+)"', webpage, 'title')
-        description = self._html_search_meta(
-            'description', webpage, 'description') or self._og_search_description(webpage)
-        timestamp = parse_iso8601(self._search_regex(
-            r'<span class="publish-date"><time datetime="([^"]+)">',
-            webpage, 'timestamp', fatal=False))
+        partner_id = self._search_regex(
+            r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id')
 
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'description': description,
-            'timestamp': timestamp,
-        }
+        kaltura_id = self._search_regex(
+            [r'id="kaltura_player_([^"]+)"',
+             r"_wb_entry_id\s*:\s*'([^']+)",
+             r'data-kaltura-entry-id="([^"]+)'],
+            webpage, 'kaltura id')
+
+        return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura')
 
 
 class EllenTVClipsIE(InfoExtractor):
@@ -68,7 +53,7 @@ class EllenTVClipsIE(InfoExtractor):
             'id': 'meryl-streep-vanessa-hudgens',
             'title': 'Meryl Streep, Vanessa Hudgens',
         },
-        'playlist_mincount': 9,
+        'playlist_mincount': 7,
     }
 
     def _real_extract(self, url):
@@ -92,4 +77,8 @@ class EllenTVClipsIE(InfoExtractor):
             raise ExtractorError('Failed to download JSON', cause=ve)
 
     def _extract_entries(self, playlist):
-        return [self.url_result(item['url'], 'EllenTV') for item in playlist]
+        return [
+            self.url_result(
+                'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']),
+                'Kaltura')
+            for item in playlist]
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
deleted file mode 100644 (file)
index 70f8efe..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import unicode_literals
-
-from .tnaflix import TNAFlixIE
-
-
-class EMPFlixIE(TNAFlixIE):
-    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
-
-    _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
-    _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
-    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-
-    _TEST = {
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
-        'info_dict': {
-            'id': '33051',
-            'display_id': 'Amateur-Finger-Fuck',
-            'ext': 'mp4',
-            'title': 'Amateur Finger Fuck',
-            'description': 'Amateur solo finger fucking.',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'age_limit': 18,
-        }
-    }
index 0cbca90b061cf2358600146f37f6da5b61d71709..316033cf18b42cefead780ceca15b361ebbddac7 100644 (file)
@@ -4,7 +4,10 @@ import re
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    unescapeHTML
+)
 
 
 class EroProfileIE(InfoExtractor):
@@ -75,8 +78,8 @@ class EroProfileIE(InfoExtractor):
             [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
             webpage, 'video id', default=None)
 
-        video_url = self._search_regex(
-            r'<source src="([^"]+)', webpage, 'video url')
+        video_url = unescapeHTML(self._search_regex(
+            r'<source src="([^"]+)', webpage, 'video url'))
         title = self._html_search_regex(
             r'Title:</th><td>([^<]+)</td>', webpage, 'title')
         thumbnail = self._search_regex(
index e47f3e27a57aa14e3eee526af8998230b524bb4f..c85b4c458d95882f56675fa135aab1f3492b6194 100644 (file)
 from __future__ import unicode_literals
 
+import json
+
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
+from ..compat import compat_urllib_request
+
 from ..utils import (
-    ExtractorError,
-    js_to_json,
-    parse_duration,
+    determine_ext,
+    clean_html,
+    int_or_none,
+    float_or_none,
 )
 
 
+def _decrypt_config(key, string):
+    a = ''
+    i = ''
+    r = ''
+
+    while len(a) < (len(string) / 2):
+        a += key
+
+    a = a[0:int(len(string) / 2)]
+
+    t = 0
+    while t < len(string):
+        i += chr(int(string[t] + string[t + 1], 16))
+        t += 2
+
+    icko = [s for s in i]
+
+    for t, c in enumerate(a):
+        r += chr(ord(c) ^ ord(icko[t]))
+
+    return r
+
+
 class EscapistIE(InfoExtractor):
-    _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
-    _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
-    _TEST = {
+    _VALID_URL = r'https?://?(?:www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
+    _TESTS = [{
         'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
         'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
         'info_dict': {
             'id': '6618',
             'ext': 'mp4',
             'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
-            'uploader_id': 'the-escapist-presents',
-            'uploader': 'The Escapist Presents',
             'title': "Breaking Down Baldur's Gate",
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 264,
+            'uploader': 'The Escapist',
+        }
+    }, {
+        'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer',
+        'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf',
+        'info_dict': {
+            'id': '10044',
+            'ext': 'mp4',
+            'description': 'This week, Zero Punctuation reviews Evolve.',
+            'title': 'Evolve - One vs Multiplayer',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 304,
+            'uploader': 'The Escapist',
         }
-    }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage_req = compat_urllib_request.Request(url)
-        webpage_req.add_header('User-Agent', self._USER_AGENT)
-        webpage = self._download_webpage(webpage_req, video_id)
-
-        uploader_id = self._html_search_regex(
-            r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'",
-            webpage, 'uploader ID', fatal=False)
-        uploader = self._html_search_regex(
-            r"<h1\s+class='headline'>(.*?)</a>",
-            webpage, 'uploader', fatal=False)
-        description = self._html_search_meta('description', webpage)
-        duration = parse_duration(self._html_search_meta('duration', webpage))
-
-        raw_title = self._html_search_meta('title', webpage, fatal=True)
-        title = raw_title.partition(' : ')[2]
-
-        config_url = compat_urllib_parse.unquote(self._html_search_regex(
-            r'''(?x)
-            (?:
-                <param\s+name="flashvars".*?\s+value="config=|
-                flashvars=&quot;config=
-            )
-            (https?://[^"&]+)
-            ''',
-            webpage, 'config URL'))
-
-        formats = []
-        ad_formats = []
-
-        def _add_format(name, cfg_url, quality):
-            cfg_req = compat_urllib_request.Request(cfg_url)
-            cfg_req.add_header('User-Agent', self._USER_AGENT)
-            config = self._download_json(
-                cfg_req, video_id,
-                'Downloading ' + name + ' configuration',
-                'Unable to download ' + name + ' configuration',
-                transform_source=js_to_json)
-
-            playlist = config['playlist']
-            for p in playlist:
-                if p.get('eventCategory') == 'Video':
-                    ar = formats
-                elif p.get('eventCategory') == 'Video Postroll':
-                    ar = ad_formats
-                else:
-                    continue
-
-                ar.append({
-                    'url': p['url'],
-                    'format_id': name,
-                    'quality': quality,
-                    'http_headers': {
-                        'User-Agent': self._USER_AGENT,
-                    },
-                })
-
-        _add_format('normal', config_url, quality=0)
-        hq_url = (config_url +
-                  ('&hq=1' if '?' in config_url else config_url + '?hq=1'))
-        try:
-            _add_format('hq', hq_url, quality=1)
-        except ExtractorError:
-            pass  # That's fine, we'll just use normal quality
+        webpage = self._download_webpage(url, video_id)
+
+        ims_video = self._parse_json(
+            self._search_regex(
+                r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'),
+            video_id)
+        video_id = ims_video['videoID']
+        key = ims_video['hash']
+
+        config_req = compat_urllib_request.Request(
+            'http://www.escapistmagazine.com/videos/'
+            'vidconfig.php?videoID=%s&hash=%s' % (video_id, key))
+        config_req.add_header('Referer', url)
+        config = self._download_webpage(config_req, video_id, 'Downloading video config')
+
+        data = json.loads(_decrypt_config(key, config))
+
+        video_data = data['videoData']
+
+        title = clean_html(video_data['title'])
+        duration = float_or_none(video_data.get('duration'), 1000)
+        uploader = video_data.get('publisher')
+
+        formats = [{
+            'url': video['src'],
+            'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']),
+            'height': int_or_none(video.get('res')),
+        } for video in data['files']['videos']]
         self._sort_formats(formats)
 
-        if '/escapist/sales-marketing/' in formats[-1]['url']:
-            raise ExtractorError('This IP address has been blocked by The Escapist', expected=True)
-
-        res = {
+        return {
             'id': video_id,
             'formats': formats,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
             'title': title,
             'thumbnail': self._og_search_thumbnail(webpage),
-            'description': description,
+            'description': self._og_search_description(webpage),
             'duration': duration,
+            'uploader': uploader,
         }
-
-        if self._downloader.params.get('include_ads') and ad_formats:
-            self._sort_formats(ad_formats)
-            ad_res = {
-                'id': '%s-ad' % video_id,
-                'title': '%s (Postroll)' % title,
-                'formats': ad_formats,
-            }
-            return {
-                '_type': 'playlist',
-                'entries': [res, ad_res],
-                'title': title,
-                'id': video_id,
-            }
-
-        return res
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644 (file)
index 0000000..e6f8f03
--- /dev/null
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ESPNIE(InfoExtractor):
+    _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+    _WORKING = False
+    _TESTS = [{
+        'url': 'http://espn.go.com/video/clip?id=10365079',
+        'info_dict': {
+            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+            'ext': 'mp4',
+            'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+            'description': '',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'class="video-play-button"[^>]+data-id="(\d+)',
+            webpage, 'video id')
+
+        player = self._download_webpage(
+            'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+
+        pcode = self._search_regex(
+            r'["\']pcode=([^"\']+)["\']', player, 'pcode')
+
+        return self.url_result(
+            'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
+            'OoyalaExternal')
index f0e575320015d435889b1bd610b4871dbd84ae21..e17bb9aeac51e2e10e2b68b4391d3022af35bcd5 100644 (file)
@@ -9,7 +9,7 @@ from ..compat import (
     compat_http_client,
     compat_str,
     compat_urllib_error,
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -24,8 +24,12 @@ class FacebookIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://(?:\w+\.)?facebook\.com/
         (?:[^#]*?\#!/)?
-        (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
-        (?:v|video_id)=(?P<id>[0-9]+)
+        (?:
+            (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
+            (?:v|video_id)=|
+            [^/]+/videos/(?:[^/]+/)?
+        )
+        (?P<id>[0-9]+)
         (?:.*)'''
     _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
     _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
@@ -46,10 +50,19 @@ class FacebookIE(InfoExtractor):
             'id': '274175099429670',
             'ext': 'mp4',
             'title': 'Facebook video #274175099429670',
-        }
+        },
+        'expected_warnings': [
+            'title'
+        ]
     }, {
         'url': 'https://www.facebook.com/video.php?v=10204634152394104',
         'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
+        'only_matching': True,
     }]
 
     def _login(self):
@@ -123,7 +136,7 @@ class FacebookIE(InfoExtractor):
             else:
                 raise ExtractorError('Cannot parse data')
         data = dict(json.loads(m.group(1)))
-        params_raw = compat_urllib_parse.unquote(data['params'])
+        params_raw = compat_urllib_parse_unquote(data['params'])
         params = json.loads(params_raw)
         video_data = params['video_data'][0]
 
@@ -139,12 +152,12 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
-            fatal=False)
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+            default=None)
         if not video_title:
             video_title = self._html_search_regex(
                 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
-                webpage, 'alternative title', default=None)
+                webpage, 'alternative title', fatal=False)
             video_title = limit_length(video_title, 80)
         if not video_title:
             video_title = 'Facebook video #%s' % video_id
index 3c39ca451a38e69a822968911e758847657380e9..cebdd0193a82eaccc673dffe9d001f766e9e31d1 100644 (file)
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 class FazIE(InfoExtractor):
     IE_NAME = 'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
         'info_dict': {
             'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
             'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
             'description': 'md5:1453fbf9a0d041d985a47306192ea253',
         },
-    }
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644 (file)
index 3191116..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
-                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
-    _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
-    _TESTS = [{
-        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
-        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
-        'info_dict': {
-            'id': 'FEB892FA160EBD01',
-            'ext': 'flv',
-            'title': 'bbb_theora_486kbit.flv',
-            'thumbnail': 're:^http://.*\.jpg$',
-        },
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://firedrive.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.firedrive.com')
-
-        webpage = self._download_webpage(req, video_id,
-                                         'Downloading video page')
-
-        title = self._search_regex(r'class="external_title_left">(.+)</div>',
-                                   webpage, 'title')
-        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
-                                       'thumbnail', fatal=False)
-        if thumbnail is not None:
-            thumbnail = 'http:' + thumbnail
-
-        ext = self._search_regex(r'type:\s?\'([^\']+)\',',
-                                 webpage, 'extension', fatal=False)
-        video_url = self._search_regex(
-            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': ext,
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644 (file)
index 0000000..13fbc4d
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    http://
+                        (?:www\.)?5-tv\.ru/
+                        (?:
+                            (?:[^/]+/)+(?P<id>\d+)|
+                            (?P<path>[^/?#]+)(?:[/?#])?
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'http://5-tv.ru/news/96814/',
+        'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+        'info_dict': {
+            'id': '96814',
+            'ext': 'mp4',
+            'title': 'Россияне выбрали имя для общенациональной платежной системы',
+            'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://5-tv.ru/video/1021729/',
+        'info_dict': {
+            'id': '1021729',
+            'ext': 'mp4',
+            'title': '3D принтер',
+            'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+        'info_dict': {
+            'id': 'glavnoe',
+            'ext': 'mp4',
+            'title': 'Итоги недели с 8 по 14 июня 2015 года',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/films/1507502/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/programs/broadcast/508713/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/angel/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+            webpage, 'video url')
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+        }
index 0c858b6544b919b1b569b4c4102447631298046e..2fe76d661bb432580cd2bd3f48c85035a4b6d7d9 100644 (file)
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_request
 from ..utils import (
     ExtractorError,
-    unescapeHTML,
+    find_xpath_attr,
 )
 
 
@@ -29,25 +30,31 @@ class FlickrIE(InfoExtractor):
         video_id = mobj.group('id')
         video_uploader_id = mobj.group('uploader_id')
         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
+        req = compat_urllib_request.Request(webpage_url)
+        req.add_header(
+            'User-Agent',
+            # it needs a more recent version
+            'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)')
+        webpage = self._download_webpage(req, video_id)
 
-        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, 'secret')
+        secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret')
 
         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
-        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
+        first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage')
 
-        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
-                                          first_xml, 'node_id')
+        node_id = find_xpath_attr(
+            first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id',
+            'id').text
 
         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
-        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
+        second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage')
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
-        if mobj is None:
+        stream = second_xml.find('.//STREAM')
+        if stream is None:
             raise ExtractorError('Unable to extract video url')
-        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
+        video_url = stream.attrib['APP'] + stream.attrib['FULLPATH']
 
         return {
             'id': video_id,
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
new file mode 100644 (file)
index 0000000..df76651
--- /dev/null
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class FoxSportsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://www.foxsports.com/video?vid=432609859715',
+        'info_dict': {
+            'id': 'gA0bHB3Ladz3',
+            'ext': 'flv',
+            'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
+            'description': 'Courtney Lee talks about Memphis being focused.',
+        },
+        'add_ie': ['ThePlatform'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        config = self._parse_json(
+            self._search_regex(
+                r"data-player-config='([^']+)'", webpage, 'data player config'),
+            video_id)
+
+        return self.url_result(smuggle_url(
+            config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True}))
index edf555b2987520618b70bf8bd423c5fc1f60e5a9..75723c00dc9e96c018e3b6771e634ff93c293ba1 100644 (file)
@@ -6,18 +6,15 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlparse,
-    compat_urlparse,
-)
+from ..compat import compat_urlparse
 from ..utils import (
     clean_html,
     ExtractorError,
     int_or_none,
-    float_or_none,
     parse_duration,
     determine_ext,
 )
+from .dailymotion import DailymotionCloudIE
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -58,12 +55,12 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
                     # See https://github.com/rg3/youtube-dl/issues/3963
                     # m3u8 urls work fine
                     continue
-                video_url_parsed = compat_urllib_parse_urlparse(video_url)
                 f4m_url = self._download_webpage(
-                    'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+                    'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url,
                     video_id, 'Downloading f4m manifest token', fatal=False)
                 if f4m_url:
-                    formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))
+                    formats.extend(self._extract_f4m_formats(
+                        f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id))
             elif ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id))
             elif video_url.startswith('rtmp'):
@@ -86,7 +83,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
             'title': info['titre'],
             'description': clean_html(info['synopsis']),
             'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
-            'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']),
+            'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
             'timestamp': int_or_none(info['diffusion']['timestamp']),
             'formats': formats,
         }
@@ -131,12 +128,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': 'HLS (reqires ffmpeg)'
         },
         'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+        'info_dict': {
+            'id': '556e03339473995ee145930c',
+            'ext': 'mp4',
+            'title': 'Les entreprises familiales : le secret de la réussite',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+        }
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
+
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
         video_id, catalogue = self._search_regex(
             r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
         return self._extract_video(video_id, catalogue)
@@ -145,11 +156,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
 class FranceTVIE(FranceTVBaseInfoExtractor):
     IE_NAME = 'francetv'
     IE_DESC = 'France 2, 3, 4, 5 and Ô'
-    _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
-        (?:
-            emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
-        |   (emissions?|jt)/(?P<key>[^/?]+)
-        )'''
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?france[2345o]\.fr/
+                                (?:
+                                    emissions/[^/]+/(?:videos|diffusions)|
+                                    emission/[^/]+|
+                                    videos|
+                                    jt
+                                )
+                            /|
+                            embed\.francetv\.fr/\?ue=
+                        )
+                        (?P<id>[^/?]+)
+                    '''
 
     _TESTS = [
         # france2
@@ -206,24 +227,46 @@ class FranceTVIE(FranceTVBaseInfoExtractor):
         },
         # franceo
         {
-            'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
-            'md5': '52f0bfe202848b15915a2f39aaa8981b',
+            'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
+            'md5': '47d5816d3b24351cdce512ad7ab31da8',
             'info_dict': {
-                'id': '108634970',
+                'id': '125377621',
                 'ext': 'flv',
-                'title': 'Infô Afrique',
-                'description': 'md5:ebf346da789428841bee0fd2a935ea55',
-                'upload_date': '20140915',
-                'timestamp': 1410822000,
+                'title': 'Infô soir',
+                'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
+                'upload_date': '20150718',
+                'timestamp': 1437241200,
+                'duration': 414,
+            },
+        },
+        {
+            # francetv embed
+            'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
+            'info_dict': {
+                'id': 'EV_30231',
+                'ext': 'flv',
+                'title': 'Alcaline, le concert avec Calogero',
+                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+                'upload_date': '20150226',
+                'timestamp': 1424989860,
+                'duration': 5400,
             },
         },
+        {
+            'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.franceo.fr/videos/125377617',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id'))
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
         video_id, catalogue = self._html_search_regex(
-            r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+            r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
             webpage, 'video ID').split('@')
         return self._extract_video(video_id, catalogue)
 
index 47373e21540030d4c9a19dbfc1c5943f468fea4f..b3f1bafcc37ee98f1c5b89a644909f3ee0a32049 100644 (file)
@@ -5,7 +5,7 @@ import json
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -14,8 +14,8 @@ from ..utils import (
 
 
 class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
-    _TEST = {
+    _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+    _TESTS = [{
         'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
         'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
         'info_dict': {
@@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Arma 3 - Community Guide: SITREP I',
             'description': 'Check out this video where some of the basics of Arma 3 is explained.',
-        }
-    }
+        },
+    }, {
+        'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+        'info_dict': {
+            'id': 'gs-2300-6424837',
+            'ext': 'flv',
+            'title': 'The Witcher 3: Wild Hunt [Xbox ONE]  - Now Playing',
+            'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+        },
+    }]
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
@@ -32,30 +40,42 @@ class GameSpotIE(InfoExtractor):
         data_video_json = self._search_regex(
             r'data-video=["\'](.*?)["\']', webpage, 'data video')
         data_video = json.loads(unescapeHTML(data_video_json))
+        streams = data_video['videoStreams']
 
-        # Transform the manifest url to a link to the mp4 files
-        # they are used in mobile devices.
-        f4m_url = data_video['videoStreams']['f4m_stream']
-        f4m_path = compat_urlparse.urlparse(f4m_url).path
-        QUALITIES_RE = r'((,\d+)+,?)'
-        qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
-        http_path = f4m_path[1:].split('/', 1)[1]
-        http_template = re.sub(QUALITIES_RE, r'%s', http_path)
-        http_template = http_template.replace('.csmil/manifest.f4m', '')
-        http_template = compat_urlparse.urljoin(
-            'http://video.gamespotcdn.com/', http_template)
         formats = []
-        for q in qualities:
-            formats.append({
-                'url': http_template % q,
-                'ext': 'mp4',
-                'format_id': q,
-            })
+        f4m_url = streams.get('f4m_stream')
+        if f4m_url is not None:
+            # Transform the manifest url to a link to the mp4 files
+            # they are used in mobile devices.
+            f4m_path = compat_urlparse.urlparse(f4m_url).path
+            QUALITIES_RE = r'((,\d+)+,?)'
+            qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
+            http_path = f4m_path[1:].split('/', 1)[1]
+            http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+            http_template = http_template.replace('.csmil/manifest.f4m', '')
+            http_template = compat_urlparse.urljoin(
+                'http://video.gamespotcdn.com/', http_template)
+            for q in qualities:
+                formats.append({
+                    'url': http_template % q,
+                    'ext': 'mp4',
+                    'format_id': q,
+                })
+        else:
+            for quality in ['sd', 'hd']:
+                # It's actually a link to a flv file
+                flv_url = streams.get('f4m_{0}'.format(quality))
+                if flv_url is not None:
+                    formats.append({
+                        'url': flv_url,
+                        'ext': 'flv',
+                        'format_id': quality,
+                    })
 
         return {
             'id': data_video['guid'],
             'display_id': page_id,
-            'title': compat_urllib_parse.unquote(data_video['title']),
+            'title': compat_urllib_parse_unquote(data_video['title']),
             'formats': formats,
             'description': self._html_search_meta('description', webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
index 51796f3a40310f484cf1a49fc6e11468264dbe3a..43f916412d9b97f3ca93cea830e5390bdcc70db0 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import remove_end
 
 
 class GDCVaultIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)?'
     _NETRC_MACHINE = 'gdcvault'
     _TESTS = [
         {
@@ -19,6 +19,7 @@ class GDCVaultIE(InfoExtractor):
             'md5': '7ce8388f544c88b7ac11c7ab1b593704',
             'info_dict': {
                 'id': '1019721',
+                'display_id': 'Doki-Doki-Universe-Sweet-Simple',
                 'ext': 'mp4',
                 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
             }
@@ -27,6 +28,7 @@ class GDCVaultIE(InfoExtractor):
             'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
             'info_dict': {
                 'id': '1015683',
+                'display_id': 'Embracing-the-Dark-Art-of',
                 'ext': 'flv',
                 'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
             },
@@ -39,10 +41,15 @@ class GDCVaultIE(InfoExtractor):
             'md5': 'a5eb77996ef82118afbbe8e48731b98e',
             'info_dict': {
                 'id': '1015301',
+                'display_id': 'Thexder-Meets-Windows-95-or',
                 'ext': 'flv',
                 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
             },
             'skip': 'Requires login',
+        },
+        {
+            'url': 'http://gdcvault.com/play/1020791/',
+            'only_matching': True,
         }
     ]
 
@@ -90,7 +97,7 @@ class GDCVaultIE(InfoExtractor):
         })
         return video_formats
 
-    def _login(self, webpage_url, video_id):
+    def _login(self, webpage_url, display_id):
         (username, password) = self._get_login_info()
         if username is None or password is None:
             self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
@@ -107,9 +114,9 @@ class GDCVaultIE(InfoExtractor):
 
         request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        self._download_webpage(request, video_id, 'Logging in')
-        start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
-        self._download_webpage(logout_url, video_id, 'Logging out')
+        self._download_webpage(request, display_id, 'Logging in')
+        start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
+        self._download_webpage(logout_url, display_id, 'Logging out')
 
         return start_page
 
@@ -117,8 +124,10 @@ class GDCVaultIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
+        display_id = mobj.group('name') or video_id
+
         webpage_url = 'http://www.gdcvault.com/play/' + video_id
-        start_page = self._download_webpage(webpage_url, video_id)
+        start_page = self._download_webpage(webpage_url, display_id)
 
         direct_url = self._search_regex(
             r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
@@ -131,6 +140,7 @@ class GDCVaultIE(InfoExtractor):
 
             return {
                 'id': video_id,
+                'display_id': display_id,
                 'url': video_url,
                 'ext': 'flv',
                 'title': title,
@@ -141,7 +151,7 @@ class GDCVaultIE(InfoExtractor):
             start_page, 'xml root', default=None)
         if xml_root is None:
             # Probably need to authenticate
-            login_res = self._login(webpage_url, video_id)
+            login_res = self._login(webpage_url, display_id)
             if login_res is None:
                 self.report_warning('Could not login.')
             else:
@@ -159,7 +169,7 @@ class GDCVaultIE(InfoExtractor):
             xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
 
         xml_decription_url = xml_root + 'xml/' + xml_name
-        xml_description = self._download_xml(xml_decription_url, video_id)
+        xml_description = self._download_xml(xml_decription_url, display_id)
 
         video_title = xml_description.find('./metadata/title').text
         video_formats = self._parse_mp4(xml_description)
@@ -168,6 +178,7 @@ class GDCVaultIE(InfoExtractor):
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'title': video_title,
             'formats': video_formats,
         }
index eaf9c769a239a0e3276e890c959e00c5cee84815..cd133a10c38c6d7bf081d550160bf730b456515e 100644 (file)
@@ -8,7 +8,8 @@ import re
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
 )
@@ -32,9 +33,21 @@ from .brightcove import BrightcoveIE
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
+from .tvc import TVCIE
+from .sportbox import SportBoxEmbedIE
 from .smotri import SmotriIE
+from .myvi import MyviIE
 from .condenast import CondeNastIE
 from .udn import UDNEmbedIE
+from .senateisvp import SenateISVPIE
+from .bliptv import BlipTVIE
+from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
+from .onionstudios import OnionStudiosIE
+from .snagfilms import SnagFilmsEmbedIE
 
 
 class GenericIE(InfoExtractor):
@@ -42,6 +55,97 @@ class GenericIE(InfoExtractor):
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
     _TESTS = [
+        # Direct link to a video
+        {
+            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+            'info_dict': {
+                'id': 'trailer',
+                'ext': 'mp4',
+                'title': 'trailer',
+                'upload_date': '20100513',
+            }
+        },
+        # Direct link to media delivered compressed (unless Accept-Encoding is *)
+        {
+            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+            'md5': '128c42e68b13950268b648275386fc74',
+            'info_dict': {
+                'id': 'FictionJunction-Parallel_Hearts',
+                'ext': 'flac',
+                'title': 'FictionJunction-Parallel_Hearts',
+                'upload_date': '20140522',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # Direct download with broken HEAD
+        {
+            'url': 'http://ai-radio.org:8000/radio.opus',
+            'info_dict': {
+                'id': 'radio',
+                'ext': 'opus',
+                'title': 'radio',
+            },
+            'params': {
+                'skip_download': True,  # infinite live stream
+            },
+            'expected_warnings': [
+                r'501.*Not Implemented'
+            ],
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:.*groundbreaking video review series.*'
+            },
+            'playlist_mincount': 11,
+        },
+        # RSS feed with enclosure
+        {
+            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'info_dict': {
+                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+                'ext': 'm4v',
+                'upload_date': '20150228',
+                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+            }
+        },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 're:^Chris Ziegler takes a look at the\.*',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
         {
             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -121,17 +225,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,  # m3u8 download
             },
         },
-        # Direct link to a video
-        {
-            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
-            'info_dict': {
-                'id': 'trailer',
-                'ext': 'mp4',
-                'title': 'trailer',
-                'upload_date': '20100513',
-            }
-        },
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -156,22 +249,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Ooyala'],
         },
-        # google redirect
-        {
-            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
-            'info_dict': {
-                'id': 'cmQHVoWB5FY',
-                'ext': 'mp4',
-                'upload_date': '20130224',
-                'uploader_id': 'TheVerge',
-                'description': 're:^Chris Ziegler takes a look at the\.*',
-                'uploader': 'The Verge',
-                'title': 'First Firefox OS phones side-by-side',
-            },
-            'params': {
-                'skip_download': False,
-            }
-        },
         # embed.ly video
         {
             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -221,6 +298,66 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # TVC embed
+        {
+            'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+            'info_dict': {
+                'id': '55304',
+                'ext': 'mp4',
+                'title': 'Дошкольное воспитание',
+            },
+        },
+        # SportBox embed
+        {
+            'url': 'http://www.vestifinance.ru/articles/25753',
+            'info_dict': {
+                'id': '25753',
+                'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': '370908',
+                    'title': 'Госзаказ. День 3',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370905',
+                    'title': 'Госзаказ. День 2',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370902',
+                    'title': 'Госзаказ. День 1',
+                    'ext': 'mp4',
+                }
+            }],
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Myvi.ru embed
+        {
+            'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+            'info_dict': {
+                'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+                'ext': 'mp4',
+                'title': 'Ужастики, русский трейлер (2015)',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 153,
+            }
+        },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+        },
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -270,6 +407,26 @@ class GenericIE(InfoExtractor):
                 'skip_download': 'Requires rtmpdump'
             }
         },
+        # francetv embed
+        {
+            'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+            'info_dict': {
+                'id': 'EV_30231',
+                'ext': 'mp4',
+                'title': 'Alcaline, le concert avec Calogero',
+                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+                'upload_date': '20150226',
+                'timestamp': 1424989860,
+                'duration': 5400,
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+            'expected_warnings': [
+                'Forbidden'
+            ]
+        },
         # Condé Nast embed
         {
             'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -372,16 +529,6 @@ class GenericIE(InfoExtractor):
                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
             }
         },
-        # RSS feed
-        {
-            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-            'info_dict': {
-                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-                'title': 'Zero Punctuation',
-                'description': 're:.*groundbreaking video review series.*'
-            },
-            'playlist_mincount': 11,
-        },
         # Multiple brightcove videos
         # https://github.com/rg3/youtube-dl/issues/2283
         {
@@ -435,21 +582,6 @@ class GenericIE(InfoExtractor):
                 'uploader': 'thoughtworks.wistia.com',
             },
         },
-        # Direct download with broken HEAD
-        {
-            'url': 'http://ai-radio.org:8000/radio.opus',
-            'info_dict': {
-                'id': 'radio',
-                'ext': 'opus',
-                'title': 'radio',
-            },
-            'params': {
-                'skip_download': True,  # infinite live stream
-            },
-            'expected_warnings': [
-                r'501.*Not Implemented'
-            ],
-        },
         # Soundcloud embed
         {
             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -481,21 +613,6 @@ class GenericIE(InfoExtractor):
             },
             'playlist_mincount': 2,
         },
-        # Direct link with incorrect MIME type
-        {
-            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-            'md5': '4ccbebe5f36706d85221f204d7eb5913',
-            'info_dict': {
-                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-                'id': '5_Lennart_Poettering_-_Systemd',
-                'ext': 'webm',
-                'title': '5_Lennart_Poettering_-_Systemd',
-                'upload_date': '20141120',
-            },
-            'expected_warnings': [
-                'URL could be a direct video link, returning it as such.'
-            ]
-        },
         # Cinchcast embed
         {
             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -583,6 +700,18 @@ class GenericIE(InfoExtractor):
                 'title': 'John Carlson Postgame 2/25/15',
             },
         },
+        # Kaltura embed (different embed code)
+        {
+            'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+            'info_dict': {
+                'id': '1_a52wc67y',
+                'ext': 'flv',
+                'upload_date': '20150127',
+                'uploader_id': 'PremierMedia',
+                'timestamp': int,
+                'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+            },
+        },
         # Eagle.Platform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -615,13 +744,24 @@ class GenericIE(InfoExtractor):
             'info_dict': {
                 'id': '100183293',
                 'ext': 'mp4',
-                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
+                'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 694,
                 'age_limit': 0,
             },
         },
+        # Playwire embed
+        {
+            'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
+            'info_dict': {
+                'id': '3519514',
+                'ext': 'mp4',
+                'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
+                'thumbnail': 're:^https?://.*\.png$',
+                'duration': 45.115,
+            },
+        },
         # 5min embed
         {
             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
@@ -632,15 +772,16 @@ class GenericIE(InfoExtractor):
                 'title': 'Facebook Creates "On This Day" | Crunch Report',
             },
         },
-        # RSS feed with enclosure
+        # SVT embed
         {
-            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
             'info_dict': {
-                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-                'ext': 'm4v',
-                'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
+                'id': '2900353',
+                'ext': 'flv',
+                'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+                'duration': 27,
+                'age_limit': 0,
+            },
         },
         # Crooks and Liars embed
         {
@@ -702,6 +843,76 @@ class GenericIE(InfoExtractor):
                 # m3u8 downloads
                 'skip_download': True,
             }
+        },
+        # Contains a SMIL manifest
+        {
+            'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
+            'info_dict': {
+                'id': 'file',
+                'ext': 'flv',
+                'title': '+ Football: Lottery Champions League Europe',
+                'uploader': 'www.telewebion.com',
+            },
+            'params': {
+                # rtmpe downloads
+                'skip_download': True,
+            }
+        },
+        # Brightcove URL in single quotes
+        {
+            'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+            'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+            'info_dict': {
+                'id': '4255764656001',
+                'ext': 'mp4',
+                'title': 'SN Presents: Russell Martin, World Citizen',
+                'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+                'uploader': 'Rogers Sportsnet',
+            },
+        },
+        # Dailymotion Cloud video
+        {
+            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+            'md5': '49444254273501a64675a7e68c502681',
+            'info_dict': {
+                'id': '5585de919473990de4bee11b',
+                'ext': 'mp4',
+                'title': 'Le débat',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+            }
+        },
+        # OnionStudios embed
+        {
+            'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+            'info_dict': {
+                'id': '2855',
+                'ext': 'mp4',
+                'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+                'uploader': 'ClickHole',
+                'uploader_id': 'clickhole',
+            }
+        },
+        # SnagFilms embed
+        {
+            'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+            'info_dict': {
+                'id': '74849a00-85a9-11e1-9660-123139220831',
+                'ext': 'mp4',
+                'title': '#whilewewatch',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
         }
     ]
 
@@ -823,7 +1034,7 @@ class GenericIE(InfoExtractor):
             force_videoid = smuggled_data['force_videoid']
             video_id = force_videoid
         else:
-            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+            video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
         self.to_screen('%s: Requesting header' % video_id)
 
@@ -845,7 +1056,9 @@ class GenericIE(InfoExtractor):
 
         full_response = None
         if head_response is False:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
             head_response = full_response
 
         # Check for direct link to a video
@@ -856,7 +1069,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'formats': [{
                     'format_id': m.group('format_id'),
@@ -867,10 +1080,22 @@ class GenericIE(InfoExtractor):
             }
 
         if not self._downloader.params.get('test', False) and not is_intentional:
-            self._downloader.report_warning('Falling back on generic information extractor.')
+            force = self._downloader.params.get('force_generic_extractor', False)
+            self._downloader.report_warning(
+                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
 
         if not full_response:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). With youtube-dl's default Accept-Encoding
+            # this always results in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass we have to override Accept-Encoding to '*' (any)
+            # in order to accept raw bytes and be able to download only a chunk.
+            # It would probably be better to solve this by checking Content-Type for application/octet-stream
+            # after the HEAD request finishes, but it is not clear whether we can rely on this.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
 
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
@@ -882,7 +1107,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'url': url,
                 'upload_date': upload_date,
@@ -909,7 +1134,7 @@ class GenericIE(InfoExtractor):
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
-        webpage = compat_urllib_parse.unquote(webpage)
+        webpage = compat_urllib_parse_unquote(webpage)
 
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
@@ -962,23 +1187,20 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded rtl.nl player
         matches = re.findall(
-            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
             webpage)
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')
 
-        # Look for embedded (iframe) Vimeo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
-        if mobj:
-            player_url = unescapeHTML(mobj.group('url'))
-            surl = smuggle_url(player_url, {'Referer': url})
-            return self.url_result(surl)
-        # Look for embedded (swf embed) Vimeo player
-        mobj = re.search(
-            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1))
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url)
+
+        vid_me_embed_url = self._search_regex(
+            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+            webpage, 'vid.me embed', default=None)
+        if vid_me_embed_url is not None:
+            return self.url_result(vid_me_embed_url, 'Vidme')
 
         # Look for embedded YouTube player
         matches = re.findall(r'''(?x)
@@ -1047,12 +1269,14 @@ class GenericIE(InfoExtractor):
             }
 
         # Look for embedded blip.tv player
-        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
-        if mobj:
-            return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
-        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1), 'BlipTV')
+        bliptv_url = BlipTVIE._extract_url(webpage)
+        if bliptv_url:
+            return self.url_result(bliptv_url, 'BlipTV')
+
+        # Look for SVT player
+        svt_url = SVTIE._extract_url(webpage)
+        if svt_url:
+            return self.url_result(svt_url, 'SVT')
 
         # Look for embedded condenast player
         matches = re.findall(
@@ -1170,7 +1394,7 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
         if mobj is not None:
-            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
 
         # Look for funnyordie embed
         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -1188,6 +1412,32 @@ class GenericIE(InfoExtractor):
         if rutv_url:
             return self.url_result(rutv_url, 'RUTV')
 
+        # Look for embedded TVC player
+        tvc_url = TVCIE._extract_url(webpage)
+        if tvc_url:
+            return self.url_result(tvc_url, 'TVC')
+
+        # Look for embedded SportBox player
+        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+        if sportbox_urls:
+            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
+        # Look for embedded PornHub player
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
+        # Look for embedded Tvigle player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Tvigle')
+
         # Look for embedded TED player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1207,11 +1457,23 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
 
+        # Look for embedded francetv player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for embedded smotri.com player
         smotri_url = SmotriIE._extract_url(webpage)
         if smotri_url:
             return self.url_result(smotri_url, 'Smotri')
 
+        # Look for embedded Myvi.ru player
+        myvi_url = MyviIE._extract_url(webpage)
+        if myvi_url:
+            return self.url_result(myvi_url)
+
         # Look for embedded soundcloud player
         mobj = re.search(
             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
@@ -1265,6 +1527,10 @@ class GenericIE(InfoExtractor):
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
             webpage)
+        if not mobj:
+            mobj = re.search(
+                r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+                webpage)
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'MLB')
 
@@ -1287,8 +1553,8 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'), 'Zapiks')
 
         # Look for Kaltura embeds
-        mobj = re.search(
-            r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
+        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
+                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
         if mobj is not None:
             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
 
@@ -1310,6 +1576,12 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Pladform')
 
+        # Look for Playwire embeds
+        mobj = re.search(
+            r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for 5min embeds
         mobj = re.search(
             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
@@ -1334,6 +1606,35 @@ class GenericIE(InfoExtractor):
             return self.url_result(
                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
 
+        # Look for Senate ISVP iframe
+        senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+        if senate_isvp_url:
+            return self.url_result(senate_isvp_url, 'SenateISVP')
+
+        # Look for Dailymotion Cloud videos
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+        # Look for OnionStudios embeds
+        onionstudios_url = OnionStudiosIE._extract_url(webpage)
+        if onionstudios_url:
+            return self.url_result(onionstudios_url)
+
+        # Look for SnagFilms embeds
+        snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
+        if snagfilms_url:
+            return self.url_result(snagfilms_url)
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
@@ -1401,7 +1702,7 @@ class GenericIE(InfoExtractor):
                 if refresh_header:
                     found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
-                new_url = found.group(1)
+                new_url = compat_urlparse.urljoin(url, found.group(1))
                 self.report_following_redirect(new_url)
                 return {
                     '_type': 'url',
@@ -1413,7 +1714,7 @@ class GenericIE(InfoExtractor):
         entries = []
         for video_url in found:
             video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
 
             # Sometimes, jwplayer extraction will result in a YouTube URL
             if YoutubeIE.suitable(video_url):
@@ -1423,13 +1724,22 @@ class GenericIE(InfoExtractor):
             # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
 
-            entries.append({
-                'id': video_id,
-                'url': video_url,
-                'uploader': video_uploader,
-                'title': video_title,
-                'age_limit': age_limit,
-            })
+            if determine_ext(video_url) == 'smil':
+                entries.append({
+                    'id': video_id,
+                    'formats': self._extract_smil_formats(video_url, video_id),
+                    'uploader': video_uploader,
+                    'title': video_title,
+                    'age_limit': age_limit,
+                })
+            else:
+                entries.append({
+                    'id': video_id,
+                    'url': video_url,
+                    'uploader': video_uploader,
+                    'title': video_title,
+                    'age_limit': age_limit,
+                })
 
         if len(entries) == 1:
             return entries[0]
diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py
new file mode 100644 (file)
index 0000000..884700c
--- /dev/null
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    qualities,
+    ExtractorError,
+)
+
+
+class GfycatIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
+        'info_dict': {
+            'id': 'DeadlyDecisiveGermanpinscher',
+            'ext': 'mp4',
+            'title': 'Ghost in the Shell',
+            'timestamp': 1410656006,
+            'upload_date': '20140914',
+            'uploader': 'anonymous',
+            'duration': 10.4,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
+        'info_dict': {
+            'id': 'JauntyTimelyAmazontreeboa',
+            'ext': 'mp4',
+            'title': 'JauntyTimelyAmazontreeboa',
+            'timestamp': 1411720126,
+            'upload_date': '20140926',
+            'uploader': 'anonymous',
+            'duration': 3.52,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        gfy = self._download_json(
+            'http://gfycat.com/cajax/get/%s' % video_id,
+            video_id, 'Downloading video info')
+        if 'error' in gfy:
+            raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
+        gfy = gfy['gfyItem']
+
+        title = gfy.get('title') or gfy['gfyName']
+        description = gfy.get('description')
+        timestamp = int_or_none(gfy.get('createDate'))
+        uploader = gfy.get('userName')
+        view_count = int_or_none(gfy.get('views'))
+        like_count = int_or_none(gfy.get('likes'))
+        dislike_count = int_or_none(gfy.get('dislikes'))
+        age_limit = 18 if gfy.get('nsfw') == '1' else 0
+
+        width = int_or_none(gfy.get('width'))
+        height = int_or_none(gfy.get('height'))
+        fps = int_or_none(gfy.get('frameRate'))
+        num_frames = int_or_none(gfy.get('numFrames'))
+
+        duration = float_or_none(num_frames, fps) if num_frames and fps else None
+
+        categories = gfy.get('tags') or gfy.get('extraLemmas') or []
+
+        FORMATS = ('gif', 'webm', 'mp4')
+        quality = qualities(FORMATS)
+
+        formats = []
+        for format_id in FORMATS:
+            video_url = gfy.get('%sUrl' % format_id)
+            if not video_url:
+                continue
+            filesize = gfy.get('%sSize' % format_id)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': width,
+                'height': height,
+                'fps': fps,
+                'filesize': filesize,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'categories': categories,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
index 775890112d219cf14a7c78d8504c62a65c612e28..28eb733e2bac89818a54952f77b342fec6ebe4ff 100644 (file)
@@ -85,7 +85,8 @@ class GigaIE(InfoExtractor):
             r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
 
         view_count = str_to_int(self._search_regex(
-            r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False))
+            r'<span class="views"><strong>([\d.,]+)</strong>',
+            webpage, 'view count', fatal=False))
 
         return {
             'id': video_id,
index ae24aff84fd85c6796c7a4374964f70629175f43..f006f0cb105dc9d7b0c1f495dbcd4c840597858a 100644 (file)
@@ -15,10 +15,10 @@ from ..utils import (
 
 
 class GorillaVidIE(InfoExtractor):
-    IE_DESC = 'GorillaVid.in, daclips.in, movpod.in and fastvideo.in'
+    IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net'
     _VALID_URL = r'''(?x)
         https?://(?P<host>(?:www\.)?
-            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in))/
+            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net))/
         (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
     '''
 
@@ -35,13 +35,7 @@ class GorillaVidIE(InfoExtractor):
         },
     }, {
         'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html',
-        'md5': 'c9e293ca74d46cad638e199c3f3fe604',
-        'info_dict': {
-            'id': 'z08zf8le23c6',
-            'ext': 'mp4',
-            'title': 'Say something nice',
-            'thumbnail': 're:http://.*\.jpg',
-        },
+        'only_matching': True,
     }, {
         'url': 'http://daclips.in/3rso4kdn6f9m',
         'md5': '1ad8fd39bb976eeb66004d3a4895f106',
@@ -61,6 +55,15 @@ class GorillaVidIE(InfoExtractor):
             'title': 'Man of Steel - Trailer',
             'thumbnail': 're:http://.*\.jpg',
         },
+    }, {
+        'url': 'http://realvid.net/ctn2y6p2eviw',
+        'md5': 'b2166d2cf192efd6b6d764c18fd3710e',
+        'info_dict': {
+            'id': 'ctn2y6p2eviw',
+            'ext': 'flv',
+            'title': 'rdx 1955',
+            'thumbnail': 're:http://.*\.jpg',
+        },
     }, {
         'url': 'http://movpod.in/0wguyyxi1yca',
         'only_matching': True,
@@ -75,12 +78,7 @@ class GorillaVidIE(InfoExtractor):
         if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         if fields['op'] == 'download1':
             countdown = int_or_none(self._search_regex(
@@ -97,7 +95,7 @@ class GorillaVidIE(InfoExtractor):
             webpage = self._download_webpage(req, video_id, 'Downloading video page')
 
         title = self._search_regex(
-            r'style="z-index: [0-9]+;">([^<]+)</span>',
+            [r'style="z-index: [0-9]+;">([^<]+)</span>', r'>Watch (.+) '],
             webpage, 'title', default=None) or self._og_search_title(webpage)
         video_url = self._search_regex(
             r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url')
diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py
deleted file mode 100644 (file)
index 36ad491..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import time
-import math
-import os.path
-import re
-
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_html_parser,
-    compat_urllib_parse,
-    compat_urllib_request,
-    compat_urlparse,
-)
-from ..utils import ExtractorError
-
-
-class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
-    def __init__(self):
-        self._current_object = None
-        self.objects = []
-        compat_html_parser.HTMLParser.__init__(self)
-
-    def handle_starttag(self, tag, attrs):
-        attrs = dict((k, v) for k, v in attrs)
-        if tag == 'object':
-            self._current_object = {'attrs': attrs, 'params': []}
-        elif tag == 'param':
-            self._current_object['params'].append(attrs)
-
-    def handle_endtag(self, tag):
-        if tag == 'object':
-            self.objects.append(self._current_object)
-            self._current_object = None
-
-    @classmethod
-    def extract_object_tags(cls, html):
-        p = cls()
-        p.feed(html)
-        p.close()
-        return p.objects
-
-
-class GroovesharkIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
-    _TEST = {
-        'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
-        'md5': '7ecf8aefa59d6b2098517e1baa530023',
-        'info_dict': {
-            'id': '6SS1DW',
-            'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
-            'ext': 'mp3',
-            'duration': 227,
-        }
-    }
-
-    do_playerpage_request = True
-    do_bootstrap_request = True
-
-    def _parse_target(self, target):
-        uri = compat_urlparse.urlparse(target)
-        hash = uri.fragment[1:].split('?')[0]
-        token = os.path.basename(hash.rstrip('/'))
-        return (uri, hash, token)
-
-    def _build_bootstrap_url(self, target):
-        (uri, hash, token) = self._parse_target(target)
-        query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
-        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
-
-    def _build_meta_url(self, target):
-        (uri, hash, token) = self._parse_target(target)
-        query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
-        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
-
-    def _build_stream_url(self, meta):
-        return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))
-
-    def _build_swf_referer(self, target, obj):
-        (uri, _, _) = self._parse_target(target)
-        return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))
-
-    def _transform_bootstrap(self, js):
-        return re.split('(?m)^\s*try\s*\{', js)[0] \
-                 .split(' = ', 1)[1].strip().rstrip(';')
-
-    def _transform_meta(self, js):
-        return js.split('\n')[0].split('=')[1].rstrip(';')
-
-    def _get_meta(self, target):
-        (meta_url, token) = self._build_meta_url(target)
-        self.to_screen('Metadata URL: %s' % meta_url)
-
-        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
-        req = compat_urllib_request.Request(meta_url, headers=headers)
-        res = self._download_json(req, token,
-                                  transform_source=self._transform_meta)
-
-        if 'getStreamKeyWithSong' not in res:
-            raise ExtractorError(
-                'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
-
-        if res['getStreamKeyWithSong'] is None:
-            raise ExtractorError(
-                'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
-                expected=True)
-
-        return res['getStreamKeyWithSong']
-
-    def _get_bootstrap(self, target):
-        (bootstrap_url, token) = self._build_bootstrap_url(target)
-
-        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
-        req = compat_urllib_request.Request(bootstrap_url, headers=headers)
-        res = self._download_json(req, token, fatal=False,
-                                  note='Downloading player bootstrap data',
-                                  errnote='Unable to download player bootstrap data',
-                                  transform_source=self._transform_bootstrap)
-        return res
-
-    def _get_playerpage(self, target):
-        (_, _, token) = self._parse_target(target)
-
-        webpage = self._download_webpage(
-            target, token,
-            note='Downloading player page',
-            errnote='Unable to download player page',
-            fatal=False)
-
-        if webpage is not None:
-            # Search (for example German) error message
-            error_msg = self._html_search_regex(
-                r'<div id="content">\s*<h2>(.*?)</h2>', webpage,
-                'error message', default=None)
-            if error_msg is not None:
-                error_msg = error_msg.replace('\n', ' ')
-                raise ExtractorError('Grooveshark said: %s' % error_msg)
-
-        if webpage is not None:
-            o = GroovesharkHtmlParser.extract_object_tags(webpage)
-            return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']
-
-        return webpage, None
-
-    def _real_initialize(self):
-        self.ts = int(time.time() * 1000)  # timestamp in millis
-
-    def _real_extract(self, url):
-        (target_uri, _, token) = self._parse_target(url)
-
-        # 1. Fill cookiejar by making a request to the player page
-        swf_referer = None
-        if self.do_playerpage_request:
-            (_, player_objs) = self._get_playerpage(url)
-            if player_objs:
-                swf_referer = self._build_swf_referer(url, player_objs[0])
-                self.to_screen('SWF Referer: %s' % swf_referer)
-
-        # 2. Ask preload.php for swf bootstrap data to better mimic webapp
-        if self.do_bootstrap_request:
-            bootstrap = self._get_bootstrap(url)
-            self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])
-
-        # 3. Ask preload.php for track metadata.
-        meta = self._get_meta(url)
-
-        # 4. Construct stream request for track.
-        stream_url = self._build_stream_url(meta)
-        duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
-        post_dict = {'streamKey': meta['streamKey']['streamKey']}
-        post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8')
-        headers = {
-            'Content-Length': len(post_data),
-            'Content-Type': 'application/x-www-form-urlencoded'
-        }
-        if swf_referer is not None:
-            headers['Referer'] = swf_referer
-
-        return {
-            'id': token,
-            'title': meta['song']['Name'],
-            'http_method': 'POST',
-            'url': stream_url,
-            'ext': 'mp3',
-            'format': 'mp3 audio',
-            'duration': duration,
-            'http_post_data': post_data,
-            'http_headers': headers,
-        }
index 63d87b74cc2d5258960fce3b0c6cd5eca48a0b11..f5aa73d18b47ff225b7e7e332051b97583ca8237 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
         title = self._html_search_regex(
-            r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+            r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
             webpage, 'title')
         wrap_url = self._html_search_regex(
-            r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+            r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
         wrap_webpage = self._download_webpage(wrap_url, video_id)
 
         video_url = self._html_search_regex(
-            r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+            r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
 
         return {
             'id': video_id,
index 40afbe537c6eb930216afbe2afda23002151d8f2..6a36933ac2c98ada87b21af4089aa158d42a3112 100644 (file)
@@ -25,7 +25,8 @@ class HistoricFilmsIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         tape_id = self._search_regex(
-            r'class="tapeId">([^<]+)<', webpage, 'tape id')
+            [r'class="tapeId"[^>]*>([^<]+)<', r'tapeId\s*:\s*"([^"]+)"'],
+            webpage, 'tape id')
 
         title = self._og_search_title(webpage)
         description = self._og_search_description(webpage)
index d606429ca88a51138f3108594e7933f74638de34..421f55bbeaed2c1249833e5136ff479557c1bccc 100644 (file)
@@ -43,7 +43,8 @@ class HitboxIE(InfoExtractor):
     def _extract_metadata(self, url, video_id):
         thumb_base = 'https://edge.sf.hitbox.tv'
         metadata = self._download_json(
-            '%s/%s' % (url, video_id), video_id)
+            '%s/%s' % (url, video_id), video_id,
+            'Downloading metadata JSON')
 
         date = 'media_live_since'
         media_type = 'livestream'
@@ -88,21 +89,41 @@ class HitboxIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        metadata = self._extract_metadata(
-            'https://www.hitbox.tv/api/media/video',
-            video_id)
-
         player_config = self._download_json(
             'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
-            video_id)
+            video_id, 'Downloading video JSON')
 
-        clip = player_config.get('clip')
-        video_url = clip.get('url')
-        res = clip.get('bitrates', [])[0].get('label')
+        formats = []
+        for video in player_config['clip']['bitrates']:
+            label = video.get('label')
+            if label == 'Auto':
+                continue
+            video_url = video.get('url')
+            if not video_url:
+                continue
+            bitrate = int_or_none(video.get('bitrate'))
+            if determine_ext(video_url) == 'm3u8':
+                if not video_url.startswith('http'):
+                    continue
+                formats.append({
+                    'url': video_url,
+                    'ext': 'mp4',
+                    'tbr': bitrate,
+                    'format_note': label,
+                    'protocol': 'm3u8_native',
+                })
+            else:
+                formats.append({
+                    'url': video_url,
+                    'tbr': bitrate,
+                    'format_note': label,
+                })
+        self._sort_formats(formats)
 
-        metadata['resolution'] = res
-        metadata['url'] = video_url
-        metadata['protocol'] = 'm3u8'
+        metadata = self._extract_metadata(
+            'https://www.hitbox.tv/api/media/video',
+            video_id)
+        metadata['formats'] = formats
 
         return metadata
 
@@ -130,10 +151,6 @@ class HitboxLiveIE(HitboxIE):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        metadata = self._extract_metadata(
-            'https://www.hitbox.tv/api/media/live',
-            video_id)
-
         player_config = self._download_json(
             'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
             video_id)
@@ -174,9 +191,13 @@ class HitboxLiveIE(HitboxIE):
                             'page_url': url,
                             'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
                         })
-
         self._sort_formats(formats)
+
+        metadata = self._extract_metadata(
+            'https://www.hitbox.tv/api/media/live',
+            video_id)
         metadata['formats'] = formats
         metadata['is_live'] = True
         metadata['title'] = self._live_title(metadata.get('title'))
+
         return metadata
index 704d0285d3e1c2ce10e8f3929543c6c66b0fd58a..a3154cfdeccf9b4c18cf5a5b01f7944243fb1509 100644 (file)
@@ -58,11 +58,7 @@ class HostingBulkIE(InfoExtractor):
             r'<img src="([^"]+)".+?class="pic"',
             webpage, 'thumbnail', fatal=False)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         request = compat_urllib_request.Request(url, urlencode_postdata(fields))
         request.add_header('Content-type', 'application/x-www-form-urlencoded')
index 3f7d6666c0810e545c0f285dcf70689806c4dac7..16677f179ecd77040e8fdc42bfb9e4095aa1774a 100644 (file)
@@ -1,8 +1,7 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..utils import parse_iso8601
 
 
 class HowcastIE(InfoExtractor):
@@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):
         'info_dict': {
             'id': '390161',
             'ext': 'mp4',
-            'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
             'title': 'How to Tie a Square Knot Properly',
-        }
+            'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+            'timestamp': 1276081287,
+            'upload_date': '20100609',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        video_id = self._match_id(url)
 
-        video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        self.report_extraction(video_id)
-
-        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
-                                       webpage, 'video URL')
-
-        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
-                                                    webpage, 'description', fatal=False)
+        embed_code = self._search_regex(
+            r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
+            webpage, 'ooyala embed code')
 
         return {
+            '_type': 'url_transparent',
+            'ie_key': 'Ooyala',
+            'url': 'ooyala:%s' % embed_code,
             'id': video_id,
-            'url': video_url,
-            'title': self._og_search_title(webpage),
-            'description': video_description,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'timestamp': parse_iso8601(self._html_search_meta(
+                'article:published_time', webpage, 'timestamp')),
         }
index e9733912132798d99be18bb935dcd3c3b190525d..663e6632a194d8ee271a0c031a921d7eed139005 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class HowStuffWorksIE(InfoExtractor):
-    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
     _TESTS = [
         {
             'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
@@ -46,6 +46,10 @@ class HowStuffWorksIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
+        {
+            'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
index 370e86e5ac7ce497c8b3c658805374246fb9690a..70e4c0d4173816e990749759cf2d36fe902904ee 100644 (file)
@@ -1,36 +1,75 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class IconosquareIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
     _TEST = {
         'url': 'http://statigr.am/p/522207370455279102_24101272',
         'md5': '6eb93b882a3ded7c378ee1d6884b1814',
         'info_dict': {
             'id': '522207370455279102_24101272',
             'ext': 'mp4',
-            'uploader_id': 'aguynamedpatrick',
-            'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
+            'title': 'Instagram media by @aguynamedpatrick (Patrick Janelle)',
             'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
+            'timestamp': 1376471991,
+            'upload_date': '20130814',
+            'uploader': 'aguynamedpatrick',
+            'uploader_id': '24101272',
+            'comment_count': int,
+            'like_count': int,
         },
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         webpage = self._download_webpage(url, video_id)
+
+        media = self._parse_json(
+            self._search_regex(
+                r'window\.media\s*=\s*({.+?});\n', webpage, 'media'),
+            video_id)
+
+        formats = [{
+            'url': f['url'],
+            'format_id': format_id,
+            'width': int_or_none(f.get('width')),
+            'height': int_or_none(f.get('height'))
+        } for format_id, f in media['videos'].items()]
+        self._sort_formats(formats)
+
         title = self._html_search_regex(
             r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>',
             webpage, 'title')
-        uploader_id = self._html_search_regex(
-            r'@([^ ]+)', title, 'uploader name', fatal=False)
+
+        timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
+        description = media.get('caption', {}).get('text')
+
+        uploader = media.get('user', {}).get('username')
+        uploader_id = media.get('user', {}).get('id')
+
+        comment_count = int_or_none(media.get('comments', {}).get('count'))
+        like_count = int_or_none(media.get('likes', {}).get('count'))
+
+        thumbnails = [{
+            'url': t['url'],
+            'id': thumbnail_id,
+            'width': int_or_none(t.get('width')),
+            'height': int_or_none(t.get('height'))
+        } for thumbnail_id, t in media.get('images', {}).items()]
 
         return {
             'id': video_id,
-            'url': self._og_search_video_url(webpage),
             'title': title,
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader_id': uploader_id
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'formats': formats,
         }
index 3aade9e740673da3193324add6a8a3ac4eff8b1f..bf2d2041b91a261d81e891bffee42966e0e53146 100644 (file)
@@ -61,7 +61,7 @@ class IGNIE(InfoExtractor):
         },
         {
             'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
-            'md5': '4e9a0bda1e5eebd31ddcf86ec0b9b3c7',
+            'md5': '618fedb9c901fd086f6f093564ef8558',
             'info_dict': {
                 'id': '078fdd005f6d3c02f63d795faa1b984f',
                 'ext': 'mp4',
@@ -77,10 +77,10 @@ class IGNIE(InfoExtractor):
     def _find_video_id(self, webpage):
         res_id = [
             r'"video_id"\s*:\s*"(.*?)"',
+            r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
             r'data-video-id="(.+?)"',
             r'<object id="vid_(.+?)"',
             r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
-            r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
         ]
         return self._search_regex(res_id, webpage, 'video id')
 
index f29df36b5bf6bd7e732ad84cbfd7d3eeb412f5ff..4bb574cf37df2421721b088ded37a3fc66c8c2ea 100644 (file)
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
             format_info = info['videoPlayerObject']['video']
             formats.append({
                 'format_id': f_id,
-                'url': format_info['url'],
+                'url': format_info['videoInfoList'][0]['videoUrl'],
             })
 
         return {
index fe5d95e2c9cad488f342233e7ebfd52e42a86de3..d692ea79ab493174038c9649445e6a592a86687c 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
     js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
 
 
 class ImgurIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'
 
     _TESTS = [{
         'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, video_id), video_id)
 
         width = int_or_none(self._search_regex(
             r'<param name="width" value="([0-9]+)"',
index 0847074eeb0f0f258db079495c7f30777b2d1838..65712abc28c3cc68cab7052ab709b2c1e6500cb5 100644 (file)
@@ -7,7 +7,7 @@ from .common import InfoExtractor
 
 
 class InaIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
     _TEST = {
         'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
         'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
index f25f43664e262b25473557c5f11dae91e697e3f6..71cfd12c56549d0be540c9daee6a2732959039de 100644 (file)
@@ -4,14 +4,15 @@ import base64
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
 )
 
 
 class InfoQIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
         'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
         'info_dict': {
@@ -20,7 +21,10 @@ class InfoQIE(InfoExtractor):
             'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
             'title': 'A Few of My Favorite [Python] Things',
         },
-    }
+    }, {
+        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,14 +39,14 @@ class InfoQIE(InfoExtractor):
         # Extract video URL
         encoded_id = self._search_regex(
             r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
-        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
+        real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
         playpath = 'mp4:' + real_id
 
         video_filename = playpath.split('/')[-1]
         video_id, extension = video_filename.split('.')
 
         http_base = self._search_regex(
-            r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+            r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,
             'HTTP base URL')
 
         formats = [{
@@ -52,7 +56,7 @@ class InfoQIE(InfoExtractor):
             'play_path': playpath,
         }, {
             'format_id': 'http',
-            'url': http_base + real_id,
+            'url': compat_urlparse.urljoin(url, http_base) + real_id,
         }]
         self._sort_formats(formats)
 
index b020e2621a5cc3c8d7ef6a1bc2cb6aaea989f779..3d78f78c46d1ad004339bc33ebcb09d1286e5092 100644 (file)
@@ -5,13 +5,14 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    limit_length,
 )
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
+    _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
     _TEST = {
-        'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
+        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
         'info_dict': {
             'id': 'aye83DjauH',
@@ -23,8 +24,8 @@ class InstagramIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+
         webpage = self._download_webpage(url, video_id)
         uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
                                          webpage, 'uploader id', fatal=False)
@@ -43,11 +44,11 @@ class InstagramIE(InfoExtractor):
 
 
 class InstagramUserIE(InfoExtractor):
-    _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
     IE_DESC = 'Instagram user profile'
     IE_NAME = 'instagram:user'
     _TEST = {
-        'url': 'http://instagram.com/porsche',
+        'url': 'https://instagram.com/porsche',
         'info_dict': {
             'id': 'porsche',
             'title': 'porsche',
@@ -102,11 +103,13 @@ class InstagramUserIE(InfoExtractor):
                 thumbnails_el = it.get('images', {})
                 thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
 
-                title = it.get('caption', {}).get('text', it['id'])
+                # In some cases caption is null, which corresponds to None
+                # in python. As a result, it.get('caption', {}) gives None
+                title = (it.get('caption') or {}).get('text', it['id'])
 
                 entries.append({
                     'id': it['id'],
-                    'title': title,
+                    'title': limit_length(title, 80),
                     'formats': formats,
                     'thumbnail': thumbnail,
                     'webpage_url': it.get('link'),
index 8529bedfc0ab283790e74144bc9d570df19dc4b3..821c8ec109236b787b9afa2985e450ff8a647595 100644 (file)
@@ -11,11 +11,12 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    remove_end,
 )
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+    _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
 
     _TESTS = [{
         'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
             'id': '39152',
             'ext': 'flv',
             'title': 'Partička (92)',
-            'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+            'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
             'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
         },
         'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
             'id': '9718337',
             'ext': 'flv',
             'title': 'Tchibo Partička - Jarní móda',
-            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
             'thumbnail': 're:^http:.*\.jpg$',
         },
         'params': {
             'skip_download': True,  # requires rtmpdump
         },
-        'skip': 'Do not have permission to access this page',
+    }, {
+        'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
 
         return {
             'id': real_id,
-            'title': self._og_search_title(webpage),
+            'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
-            'description': self._og_search_description(webpage),
+            'description': self._search_regex(
+                r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+                webpage, 'description', default=None),
         }
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644 (file)
index 0000000..afb7f4e
--- /dev/null
@@ -0,0 +1,273 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import random
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+    IE_DESC = '爱奇艺'
+
+    # Video pages look like http://www.iqiyi.com/v_<token>.html
+    _VALID_URL = r'http://(?:www\.)?iqiyi\.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        """Resolve the download URL of every segment of every known format.
+
+        data is the 'data' object of the VMS response, video_id only labels
+        the JSON downloads and _uuid is sent as the 'su' query parameter.
+
+        Returns a dict mapping each format id found in _FORMATS_MAP to a list
+        of (video_url, filesize) tuples, one tuple per segment. Formats whose
+        'bid' is not listed in _FORMATS_MAP are left out.
+        """
+        def do_xor(x, y):
+            # The XOR mask is selected by the position modulo 3
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            # l is a '-' separated list of hexadecimal codes. Each code is
+            # XORed with a mask chosen from its distance to the end of the
+            # list and the resulting characters are joined in reverse order.
+            b = l.split('-')
+            c = len(b)
+            s = ''
+            for i in range(c - 1, -1, -1):
+                s += chr(do_xor(int(b[c - i - 1], 16), i))
+            return s[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            # The key is built from the server time in steps of 600
+            t = str(int(tm) // 600)
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            # get_format() returns None for a bid that is not listed in
+            # _FORMATS_MAP; such a format cannot be identified (and cannot be
+            # mapped back by get_bid()), so it is skipped.
+            format_id = self.get_format(format_item['bid'])
+            if format_id is None:
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                # An encoded path that decodes to an mp4 name means the
+                # segments to use are listed under 'flvs' instead
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                # The path key goes in front of the last component of 'du'
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        """Return the format id paired with bid in _FORMATS_MAP, or None."""
+        wanted_bid = str(bid)
+        for known_bid, known_format_id in self._FORMATS_MAP:
+            if known_bid == wanted_bid:
+                return known_format_id
+        return None
+
+    def get_bid(self, format_id):
+        """Return the bid paired with format_id in _FORMATS_MAP, or None."""
+        for known_bid, known_format_id in self._FORMATS_MAP:
+            if known_format_id == format_id:
+                return known_bid
+        return None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        """Download and return the JSON answer of the VMS API for a video.
+
+        The request is signed with two MD5 digests built from the current
+        time, tvid and enc_key; _uuid is sent as the 'qyid' parameter.
+        """
+        now = str(int(time.time()))
+        enc_digest = hashlib.md5(
+            (enc_key + now + tvid).encode('utf8')).hexdigest()
+        auth_digest = hashlib.md5(
+            (now + tvid).encode('utf8')).hexdigest()
+
+        query = compat_urllib_parse.urlencode({
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': now,
+            'enc': enc_digest,
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': auth_digest,
+        })
+
+        return self._download_json(
+            'http://cache.video.qiyi.com/vms' + '?' + query, video_id)
+
+    def get_enc_key(self, swf_url, video_id):
+        """Return the key used to sign VMS requests.
+
+        The key is a fixed constant; swf_url and video_id are not used.
+        """
+        return '8e29ab5666d041c3a1ea76e06dabdffb'
+
+    def _real_extract(self, url):
+        """Extract an iQiyi video page.
+
+        Returns a single video info dict when the video has one segment and
+        a multi_video playlist with one entry per segment otherwise. Raises
+        ExtractorError when the API reports an error, when the video has no
+        track list or when no usable format is found.
+        """
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('No support iQiyi VIP video')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info: segment i of every format goes into entry i
+        entries = []
+        for format_id, video_urls in video_urls_dict.items():
+            bid = self.get_bid(format_id)
+            if bid is None:
+                # Format id unknown to _FORMATS_MAP, no preference available
+                continue
+            for i, (video_url, filesize) in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url,
+                        'filesize': filesize,
+                        'format_id': format_id,
+                        'preference': int(bid)
+                    }
+                )
+
+        if not entries:
+            # Fail with a proper extractor error instead of an IndexError
+            raise ExtractorError('No video formats found')
+
+        for part_number, entry in enumerate(entries, 1):
+            self._sort_formats(entry['formats'])
+            entry.update(
+                {
+                    'id': '%s_part%d' % (video_id, part_number),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            # A single segment is returned as a plain video with the bare id
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
index 99a1361f844c15520c842cd9fffa1e5c2e9b6974..bc226fa67c064b991674a510b1eba54d40dc67e0 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     determine_ext,
     float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'md5:253753e2655dde93f59f74b572454f6d',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'pelikzzle',
-                'timestamp': 1404302298,
+                'timestamp': int,
                 'upload_date': '20140702',
                 'duration': 95.395,
                 'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'Tarkan Dortmund 2006 Konseri',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'parlayankiz',
-                'timestamp': 1163322193,
+                'timestamp': int,
                 'upload_date': '20061112',
                 'duration': 253.666,
                 'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
 
         uploader = self._html_search_regex(
             r"adduserUsername\s*=\s*'([^']+)';",
-            webpage, 'uploader', fatal=False, default='')
+            webpage, 'uploader', fatal=False)
         timestamp = parse_iso8601(self._html_search_meta(
-            'uploadDate', webpage, 'upload date', fatal=False))
+            'uploadDate', webpage, 'upload date'))
 
         duration = float_or_none(self._html_search_regex(
             r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
 
         # Might be empty for some videos.
         streams = self._html_search_regex(
-            r'"qualitylevel"\s*:\s*"([^"]+)"',
-            webpage, 'streams', fatal=False, default='')
+            r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
 
         formats = []
         if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
                 quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
                 formats.append({
                     'format_id': '%sp' % quality if quality else 'sd',
-                    'url': url,
+                    'url': compat_urllib_parse_unquote(url),
                     'ext': ext,
                 })
         else:
             stream_url = self._search_regex(
-                r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+                r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
             formats.append({
                 'format_id': 'sd',
-                'url': stream_url,
+                'url': compat_urllib_parse_unquote(stream_url),
                 'ext': ext,
             })
 
index d0720ff561c16e8c0816c5ff7ab333e54c297dbc..1df084d87ae4c712d9bcfa1aac6d6367641287b5 100644 (file)
@@ -8,9 +8,9 @@ from .common import InfoExtractor
 
 
 class JeuxVideoIE(InfoExtractor):
-    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
+    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
         'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
         'info_dict': {
@@ -19,7 +19,10 @@ class JeuxVideoIE(InfoExtractor):
             'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
             'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
         },
-    }
+    }, {
+        'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 2bb078036391e3c370a6e3b72a61eccc299b3e67..4597d1b961a0fcae8137d3ec919fc2ce6ac31777 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     float_or_none,
+    srt_subtitles_timecode,
 )
 
 
@@ -39,8 +40,8 @@ class KanalPlayIE(InfoExtractor):
             '%s\r\n%s --> %s\r\n%s'
             % (
                 num,
-                self._subtitles_timecode(item['startMillis'] / 1000.0),
-                self._subtitles_timecode(item['endMillis'] / 1000.0),
+                srt_subtitles_timecode(item['startMillis'] / 1000.0),
+                srt_subtitles_timecode(item['endMillis'] / 1000.0),
                 item['text'],
             ) for num, item in enumerate(subs, 1))
 
index e3b43ff8dbfec5e065aa069a0fb140dcfb5822c9..06daf5a89ce3ffde4d71d7dc8ceee9441840b72b 100644 (file)
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
 from ..utils import (
     js_to_json,
 )
@@ -24,7 +24,7 @@ class KaraoketvIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         page_video_url = self._og_search_video_url(webpage, video_id)
-        config_json = compat_urllib_parse.unquote_plus(self._search_regex(
+        config_json = compat_urllib_parse_unquote_plus(self._search_regex(
             r'config=(.*)', page_video_url, 'configuration'))
 
         urls_info_json = self._download_json(
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..bed94bc
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    """Extractor for karrierevideos.at pages; returns an RTMP URL and play path."""
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict for the video embedded in the page at ``url``."""
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        # Fall back to the <h1> heading when the page has no title meta tag.
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(
+                     r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
+
+        # The id used for the playlist request comes from the config path.
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        ns_map = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, ns_map)
+
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+        uploader = xpath_text(item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>', webpage, 'description')
+
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
index 7d4b57056509383fdc082a68c1650f38dc258763..1d391e69ff7e0aba1b78ae5e32792b2dca839943 100644 (file)
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
             'uploader': 'Pebble Technology',
             'title': 'Pebble iOS Notifications',
         }
+    }, {
+        'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+        'info_dict': {
+            'id': '1420158244',
+            'ext': 'mp4',
+            'title': 'Power Drive 2000',
+        },
+        'expected_warnings': ['OpenGraph description'],
     }]
 
     def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
                 'title': title,
             }
 
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            thumbnail = self._html_search_regex(
+                r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+                webpage, 'thumbnail image', fatal=False)
         return {
             'id': video_id,
             'url': video_url,
             'title': title,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': thumbnail,
         }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
new file mode 100644 (file)
index 0000000..1077846
--- /dev/null
@@ -0,0 +1,314 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_id,
+    clean_html,
+    ExtractorError,
+    remove_start,
+)
+
+
+class KuwoBaseIE(InfoExtractor):
+    _FORMATS = [
+        {'format': 'ape', 'ext': 'ape', 'preference': 100},
+        {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
+        {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
+        {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
+        {'format': 'wma', 'ext': 'wma', 'preference': 20},
+        {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
+    ]
+
+    def _get_formats(self, song_id):
+        """Return sorted format dicts for each _FORMATS entry that yields a URL."""
+        formats = []
+        for fmt in self._FORMATS:
+            query = (fmt['ext'], fmt.get('br', ''), song_id)
+            song_url = self._download_webpage(
+                'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % query,
+                song_id, note='Download %s url info' % fmt['format'])
+            if song_url.startswith(('http://', 'https://')):
+                formats.append({
+                    'url': song_url,
+                    'format_id': fmt['format'],
+                    'format': fmt['format'],
+                    'preference': fmt['preference'],
+                    'abr': fmt.get('abr'),
+                })
+        self._sort_formats(formats)
+        return formats
+
+
+class KuwoIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:song'
+    IE_DESC = '酷我音乐'
+    _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/yinyue/635632/',
+        'info_dict': {
+            'id': '635632',
+            'ext': 'ape',
+            'title': '爱我别走',
+            'creator': '张震岳',
+            'upload_date': '20080122',
+            'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
+        },
+    }, {
+        'url': 'http://www.kuwo.cn/yinyue/6446136/',
+        'info_dict': {
+            'id': '6446136',
+            'ext': 'mp3',
+            'title': '心',
+            'creator': 'IU',
+            'upload_date': '20150518',
+        },
+        'params': {
+            'format': 'mp3-320'
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Gather a song's name, singer, lyrics, formats and album publish date."""
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download song detail info',
+            errnote='Unable to get song detail info')
+
+        title = self._html_search_regex(
+            r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
+        artist = self._html_search_regex(
+            r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
+            webpage, 'singer name', fatal=False)
+        lyrics = clean_html(get_element_by_id('lrcContent', webpage))
+        if lyrics == '暂无':     # indicates no lyrics
+            lyrics = None
+
+        formats = self._get_formats(song_id)
+        album_id = self._html_search_regex(
+            r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+            webpage, 'album id', fatal=False)
+
+        # The publish date is read from the album page, when one is linked.
+        upload_date = None
+        if album_id is not None:
+            album_page = self._download_webpage(
+                'http://www.kuwo.cn/album/%s/' % album_id, song_id,
+                note='Download album detail info',
+                errnote='Unable to get album detail info')
+            upload_date = self._html_search_regex(
+                r'发行时间:(\d{4}-\d{2}-\d{2})', album_page,
+                'publish time', fatal=False)
+            if upload_date:
+                upload_date = upload_date.replace('-', '')
+
+        return {
+            'id': song_id,
+            'title': title,
+            'creator': artist,
+            'upload_date': upload_date,
+            'description': lyrics,
+            'formats': formats,
+        }
+
+
+class KuwoAlbumIE(InfoExtractor):
+    IE_NAME = 'kuwo:album'
+    IE_DESC = '酷我音乐 - 专辑'
+    _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/album/502294/',
+        'info_dict': {
+            'id': '502294',
+            'title': 'M',
+            'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
+        },
+        'playlist_count': 2,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per song linked from the album page."""
+        album_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, album_id, note='Download album info',
+            errnote='Unable to get album info')
+
+        title = self._html_search_regex(
+            r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
+            'album name')
+        # Strip the leading "<album name>简介:" label from the intro, if present.
+        intro = clean_html(get_element_by_id('intro', webpage))
+        description = remove_start(intro, '%s简介:' % title)
+
+        song_urls = re.findall(
+            r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
+            webpage)
+        entries = [self.url_result(song_url, 'Kuwo') for song_url in song_urls]
+        return self.playlist_result(entries, album_id, title, description)
+
+
+class KuwoChartIE(InfoExtractor):
+    IE_NAME = 'kuwo:chart'
+    IE_DESC = '酷我音乐 - 排行榜'
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
+        'info_dict': {
+            'id': '香港中文龙虎榜',
+            'title': '香港中文龙虎榜',
+            'description': 're:\d{4}第\d{2}期',
+        },
+        'playlist_mincount': 10,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of the songs linked from the chart page at ``url``."""
+        chart_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, chart_id, note='Download chart info',
+            errnote='Unable to get chart info')
+
+        chart_name = self._html_search_regex(
+            r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')
+        chart_desc = self._html_search_regex(
+            r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')
+
+        # Capture the trailing '/' too: KuwoIE._VALID_URL requires it.
+        song_urls = re.findall(
+            r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage)
+        entries = [self.url_result(song_url, 'Kuwo') for song_url in song_urls]
+        return self.playlist_result(entries, chart_id, chart_name, chart_desc)
+
+
+class KuwoSingerIE(InfoExtractor):
+    IE_NAME = 'kuwo:singer'
+    IE_DESC = '酷我音乐 - 歌手'
+    _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
+        'info_dict': {
+            'id': 'bruno+mars',
+            'title': 'Bruno Mars',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
+        'info_dict': {
+            'id': 'Ali',
+            'title': 'Ali',
+        },
+        'playlist_mincount': 95,
+    }]
+
+    def _real_extract(self, url):
+        """Return the singer's songs: the first ten, or all pages for a music.htm URL."""
+        singer_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, singer_id, note='Download singer info',
+            errnote='Unable to get singer info')
+
+        singer_name = self._html_search_regex(
+            r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name')
+
+        first_page_only = not re.search(r'/music(?:_\d+)?\.htm', url)
+        entries = []
+        for page_num in itertools.count(1):
+            page = self._download_webpage(
+                'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
+                singer_id, note='Download song list page #%d' % page_num,
+                errnote='Unable to get song list page #%d' % page_num)
+            # Capture the trailing '/' too: KuwoIE._VALID_URL requires it.
+            song_urls = re.findall(
+                r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)',
+                page)[:10 if first_page_only else None]
+            entries.extend(
+                self.url_result(song_url, 'Kuwo') for song_url in song_urls)
+
+            if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', page):
+                break
+
+        return self.playlist_result(entries, singer_id, singer_name)
+
+
+class KuwoCategoryIE(InfoExtractor):
+    IE_NAME = 'kuwo:category'
+    IE_DESC = '酷我音乐 - 分类'
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
+        'info_dict': {
+            'id': '86375',
+            'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
+        },
+        'playlist_count': 30,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist built from the song list embedded in the category page."""
+        category_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, category_id, note='Download category info',
+            errnote='Unable to get category info')
+
+        title = self._html_search_regex(
+            r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
+        intro = get_element_by_id('intro', webpage).strip()
+        description = remove_start(intro, '%s简介:' % title)
+
+        # The song list is assigned to a JavaScript variable named "jsonm".
+        songs_json = self._html_search_regex(
+            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs')
+        song_list = self._parse_json(songs_json, category_id)['musiclist']
+
+        entries = [
+            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
+            for song in song_list]
+        return self.playlist_result(entries, category_id, title, description)
+
+
+class KuwoMvIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:mv'
+    IE_DESC = '酷我音乐 - MV'
+    _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/mv/6480076/',
+        'info_dict': {
+            'id': '6480076',
+            'ext': 'mkv',
+            'title': '我们家MV',
+            'creator': '2PM',
+        },
+    }
+    _FORMATS = KuwoBaseIE._FORMATS + [
+        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
+        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
+    ]
+
+    def _real_extract(self, url):
+        """Return the title, singer and formats for the MV page at ``url``."""
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download mv detail info: %s' % song_id,
+            errnote='Unable to get mv detail info: %s' % song_id)
+
+        # Song and singer names come from title attributes in the page heading.
+        names = re.search(
+            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
+            webpage)
+        if names is None:
+            raise ExtractorError('Unable to find song or singer names')
+        song_name, singer_name = names.group('song', 'singer')
+
+        formats = self._get_formats(song_id)
+
+        return {
+            'id': song_id,
+            'title': song_name,
+            'creator': singer_name,
+            'formats': formats,
+        }
index 1484ac0d267697dceb34c9e406e3a26b26a37f54..ba2ae80853d36ce1b9a98d109c2580bcdde65d5a 100644 (file)
@@ -19,6 +19,7 @@ from ..utils import (
 
 
 class LetvIE(InfoExtractor):
+    IE_DESC = '乐视网'
     _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
 
     _TESTS = [{
@@ -50,9 +51,7 @@ class LetvIE(InfoExtractor):
             'title': '与龙共舞 完整版',
             'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
         },
-        'params': {
-            'cn_verification_proxy': 'http://proxy.uku.im:8888'
-        },
+        'skip': 'Only available in China',
     }]
 
     @staticmethod
index 1dfe7f77f4ccdefa0b076f71f6467644e465cb52..f8cbca7b36afab1890b71806d6761bbe67d7d924 100644 (file)
@@ -4,8 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
+    determine_ext,
     int_or_none,
+    remove_end,
     unified_strdate,
     ExtractorError,
 )
@@ -14,9 +17,9 @@ from ..utils import (
 class LifeNewsIE(InfoExtractor):
     IE_NAME = 'lifenews'
     IE_DESC = 'LIFE | NEWS'
-    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
+    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://lifenews.ru/news/126342',
         'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
         'info_dict': {
@@ -27,48 +30,139 @@ class LifeNewsIE(InfoExtractor):
             'thumbnail': 're:http://.*\.jpg',
             'upload_date': '20140130',
         }
-    }
+    }, {
+        # video in <iframe>
+        'url': 'http://lifenews.ru/news/152125',
+        'md5': '77d19a6f0886cd76bdbf44b4d971a273',
+        'info_dict': {
+            'id': '152125',
+            'ext': 'mp4',
+            'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
+            'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+            'upload_date': '20150402',
+        }
+    }, {
+        'url': 'http://lifenews.ru/news/153461',
+        'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+        'info_dict': {
+            'id': '153461',
+            'ext': 'mp4',
+            'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
+            'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+            'upload_date': '20150505',
+        }
+    }, {
+        'url': 'http://lifenews.ru/video/13035',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        section = mobj.group('section')
 
-        webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
+        webpage = self._download_webpage(
+            'http://lifenews.ru/%s/%s' % (section, video_id),
+            video_id, 'Downloading page')
 
         videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
-        if not videos:
+        iframe_link = self._html_search_regex(
+            '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
+        if not videos and not iframe_link:
             raise ExtractorError('No media links available for %s' % video_id)
 
-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Первый по срочным новостям — LIFE | NEWS')
 
         description = self._og_search_description(webpage)
 
         view_count = self._html_search_regex(
-            r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
+            r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
+            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+            webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
-            r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
 
+        common_info = {
+            'description': description,
+            'view_count': int_or_none(view_count),
+            'comment_count': int_or_none(comment_count),
+            'upload_date': upload_date,
+        }
+
         def make_entry(video_id, media, video_number=None):
-            return {
+            cur_info = dict(common_info)
+            cur_info.update({
                 'id': video_id,
                 'url': media[1],
                 'thumbnail': media[0],
                 'title': title if video_number is None else '%s-video%s' % (title, video_number),
-                'description': description,
-                'view_count': int_or_none(view_count),
-                'comment_count': int_or_none(comment_count),
-                'upload_date': upload_date,
-            }
+            })
+            return cur_info
+
+        if iframe_link:
+            iframe_link = self._proto_relative_url(iframe_link, 'http:')
+            cur_info = dict(common_info)
+            cur_info.update({
+                '_type': 'url_transparent',
+                'id': video_id,
+                'title': title,
+                'url': iframe_link,
+            })
+            return cur_info
 
         if len(videos) == 1:
             return make_entry(video_id, videos[0])
         else:
             return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+
+
+class LifeEmbedIE(InfoExtractor):
+    IE_NAME = 'life:embed'
+    _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+
+    _TEST = {
+        'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
+        'md5': 'b889715c9e49cb1981281d0e5458fbbe',
+        'info_dict': {
+            'id': 'e50c2dec2867350528e2574c899b8291',
+            'ext': 'mp4',
+            'title': 'e50c2dec2867350528e2574c899b8291',
+            'thumbnail': 're:http://.*\.jpg',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Turn every "file" URL found in the embed page into a format entry."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        formats = []
+        for file_ref in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+            # urljoin resolves the reference against the embed page URL.
+            media_url = compat_urlparse.urljoin(url, file_ref)
+            ext = determine_ext(media_url)
+            if ext != 'm3u8':
+                formats.append({
+                    'url': media_url,
+                    'format_id': ext,
+                    'preference': 1,
+                })
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                media_url, video_id, 'mp4', m3u8_id='m3u8'))
+        self._sort_formats(formats)
+
+        thumbnail = self._search_regex(
+            r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
index 35822067f908f0567e8dcb8c9c8265df4d3421c2..857edfde263196d9bf2811568cc9f9de90eed92b 100644 (file)
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
             'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
             'age_limit': 18,
         }
+    }, {
+        # Covers https://github.com/rg3/youtube-dl/pull/5983
+        'url': 'http://www.liveleak.com/view?i=801_1409392012',
+        'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+        'info_dict': {
+            'id': '801_1409392012',
+            'ext': 'mp4',
+            'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+            'uploader': 'bony333',
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+        }
     }]
 
     def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
             'url': s['file'],
         } for i, s in enumerate(sources)]
         for i, s in enumerate(sources):
-            orig_url = s['file'].replace('.h264_base.mp4', '')
+            # Removing '.h264_*.mp4' gives the raw video, which is essentially
+            # the same video without the LiveLeak logo at the top (see
+            # https://github.com/rg3/youtube-dl/pull/4768)
+            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
             if s['file'] != orig_url:
                 formats.append({
                     'format_id': 'original-%s' % i,
index ec309dadd848f7c1ae46b28c3be329c32cef48c9..6d7733e4111355a5011765336333f229596b8356 100644 (file)
@@ -194,23 +194,19 @@ class LivestreamIE(InfoExtractor):
 # The original version of Livestream uses a different system
 class LivestreamOriginalIE(InfoExtractor):
     IE_NAME = 'livestream:original'
-    _VALID_URL = r'''(?x)https?://www\.livestream\.com/
+    _VALID_URL = r'''(?x)https?://original\.livestream\.com/
         (?P<user>[^/]+)/(?P<type>video|folder)
         (?:\?.*?Id=|/)(?P<id>.*?)(&|$)
         '''
     _TESTS = [{
-        'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
         'info_dict': {
             'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
         },
-        'params': {
-            # rtmp
-            'skip_download': True,
-        },
     }, {
-        'url': 'https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+        'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
         'info_dict': {
             'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',
         },
@@ -221,19 +217,17 @@ class LivestreamOriginalIE(InfoExtractor):
         api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
 
         info = self._download_xml(api_url, video_id)
+        # this url is used on mobile devices
+        stream_url = 'http://x{0}x.api.channel.livestream.com/3.0/getstream.json?id={1}'.format(user, video_id)
+        stream_info = self._download_json(stream_url, video_id)
         item = info.find('channel').find('item')
         ns = {'media': 'http://search.yahoo.com/mrss'}
         thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']
-        # Remove the extension and number from the path (like 1.jpg)
-        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, 'path')
 
         return {
             'id': video_id,
             'title': item.find('title').text,
-            'url': 'rtmp://extondemand.livestream.com/ondemand',
-            'play_path': 'trans/dv15/mogulus-{0}'.format(path),
-            'player_url': 'http://static.livestream.com/chromelessPlayer/v21/playerapi.swf?hash=5uetk&v=0803&classid=D27CDB6E-AE6D-11cf-96B8-444553540000&jsEnabled=false&wmode=opaque',
-            'ext': 'flv',
+            'url': stream_info['progressiveUrl'],
             'thumbnail': thumbnail_url,
         }
 
index cfd3b14f4bfd755a7600701e86900ece12b0c3ac..a00f6e5e5eb1d398ef0776d8b20dcb1dd51ec082 100644 (file)
@@ -30,13 +30,13 @@ class LyndaBaseIE(InfoExtractor):
             return
 
         login_form = {
-            'username': username,
-            'password': password,
+            'username': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
             'remember': 'false',
             'stayPut': 'false'
         }
         request = compat_urllib_request.Request(
-            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
         login_page = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
@@ -65,7 +65,7 @@ class LyndaBaseIE(InfoExtractor):
                     'stayPut': 'false',
                 }
                 request = compat_urllib_request.Request(
-                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
+                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))
                 login_page = self._download_webpage(
                     request, None,
                     'Confirming log in and log out from another device')
index 0b85a59d1c644d7d04e573aae0bdd03ebd4f6c80..92511a671ae300287fe6eb57b91b9c708dba45c5 100644 (file)
@@ -2,9 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class MalemotionIE(InfoExtractor):
@@ -24,7 +22,7 @@ class MalemotionIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_url = compat_urllib_parse.unquote(self._search_regex(
+        video_url = compat_urllib_parse_unquote(self._search_regex(
             r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
         video_title = self._html_search_regex(
             r'<title>(.*?)</title', webpage, 'title')
diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py
new file mode 100644 (file)
index 0000000..af7ff07
--- /dev/null
@@ -0,0 +1,56 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    xpath_text,
+)
+
+
+class MegaVideozIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P<id>[^/]+)(?:/(?P<display_id>[^/]+))?'
+    _TEST = {
+        'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader',
+        'info_dict': {
+            'id': '48723',
+            'display_id': 'SMPTE-Universal-Film-Leader',
+            'ext': 'mp4',
+            'title': 'SMPTE Universal Film Leader',
+            'thumbnail': 're:https?://.*?\.jpg',
+            'duration': 10.93,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Read the XML config referenced by the page and map it to an info dict."""
+        url_id, slug = re.match(self._VALID_URL, url).group('id', 'display_id')
+        display_id = slug or url_id
+
+        webpage = self._download_webpage(url, display_id)
+        for marker in ('>Video Not Found<', '>404 Error<'):
+            if marker in webpage:
+                raise ExtractorError('Video %s does not exist' % url_id, expected=True)
+
+        cnf_url = self._search_regex(
+            r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url')
+        config = self._download_xml(cnf_url, display_id)
+
+        video_url = xpath_text(config, './file', 'video url', fatal=True)
+        title = xpath_text(config, './title', 'title', fatal=True)
+        thumbnail = xpath_text(config, './image', 'thumbnail')
+        duration = float_or_none(xpath_text(config, './duration', 'duration'))
+        # Prefer the media id from the config over the id taken from the URL.
+        video_id = xpath_text(config, './mediaid', 'video id') or url_id
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration
+        }
index 8bc333b0277e27e6fd8f3d4f11b3c9c7eabdd7d7..6e2e73a5162f10ea5818b636da579c932b4f2e7d 100644 (file)
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -155,7 +156,7 @@ class MetacafeIE(InfoExtractor):
         video_url = None
         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
         if mobj is not None:
-            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
+            mediaURL = compat_urllib_parse_unquote(mobj.group(1))
             video_ext = mediaURL[-3:]
 
             # Extract gdaKey if available
index d41195a9647a7cdd329009c5ce448388f0ec8f20..a784fc5fba41c5931f6b1f040042e1900a6ff791 100644 (file)
@@ -31,6 +31,14 @@ class MioMioIE(InfoExtractor):
             'title': '《动漫同人插画绘制》',
         },
         'playlist_mincount': 86,
+        'skip': 'This video takes time too long for retrieving the URL',
+    }, {
+        'url': 'http://www.miomio.tv/watch/cc173113/',
+        'info_dict': {
+            'id': '173113',
+            'title': 'The New Macbook 2015 上手试玩与简评'
+        },
+        'playlist_mincount': 2,
     }]
 
     def _real_extract(self, url):
index d8897eb90d526b7b7d2e5a5ace5bec84ebb40031..852d722664a3d63aafed0f8246949335b4150c09 100644 (file)
@@ -5,6 +5,7 @@ import json
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -20,7 +21,6 @@ class MiTeleIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
-        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
         'info_dict': {
             'id': '0fce117d',
             'ext': 'mp4',
@@ -29,6 +29,10 @@ class MiTeleIE(InfoExtractor):
             'display_id': 'programa-144',
             'duration': 2913,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -45,7 +49,7 @@ class MiTeleIE(InfoExtractor):
             domain = 'http://' + domain
         info_url = compat_urlparse.urljoin(
             domain,
-            compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+            compat_urllib_parse_unquote(embed_data['flashvars']['host'])
         )
         info_el = self._download_xml(info_url, episode).find('./video/info')
 
@@ -56,12 +60,14 @@ class MiTeleIE(InfoExtractor):
             episode,
             transform_source=strip_jsonp
         )
+        formats = self._extract_m3u8_formats(
+            token_info['tokenizedUrl'], episode, ext='mp4')
 
         return {
             'id': embed_data['videoId'],
             'display_id': episode,
             'title': info_el.find('title').text,
-            'url': token_info['tokenizedUrl'],
+            'formats': formats,
             'description': get_element_by_attribute('class', 'text', webpage),
             'thumbnail': info_el.find('thumb').text,
             'duration': parse_duration(info_el.find('duration').text),
index 84f29155841007f3088a86470040407073726067..d47aecedae388829babaed8642611c5a6b7d29fe 100644 (file)
@@ -1,12 +1,9 @@
 from __future__ import unicode_literals
 
 import re
-import itertools
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     ExtractorError,
     HEADRequest,
@@ -46,41 +43,32 @@ class MixcloudIE(InfoExtractor):
         },
     }]
 
-    def _get_url(self, track_id, template_url, server_number):
-        boundaries = (1, 30)
-        for nr in server_numbers(server_number, boundaries):
-            url = template_url % nr
-            try:
-                # We only want to know if the request succeed
-                # don't download the whole file
-                self._request_webpage(
-                    HEADRequest(url), track_id,
-                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))
-                return url
-            except ExtractorError:
-                pass
-        return None
+    def _check_url(self, url, track_id, ext):
+        """Return True if a HEAD request for url succeeds, False otherwise.
+
+        ext is only used to label the progress message.
+        """
+        try:
+            # A HEAD request is enough: we only need to know whether the
+            # URL is reachable, not to download the whole file
+            self._request_webpage(
+                HEADRequest(url), track_id, 'Trying %s URL' % ext)
+        except ExtractorError:
+            return False
+        else:
+            return True
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group(1)
         cloudcast_name = mobj.group(2)
-        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
+        track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
 
         webpage = self._download_webpage(url, track_id)
 
         preview_url = self._search_regex(
             r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
         song_url = preview_url.replace('/previews/', '/c/originals/')
-        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
-        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
-        final_song_url = self._get_url(track_id, template_url, server_number)
-        if final_song_url is None:
-            self.to_screen('Trying with m4a extension')
-            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
-            final_song_url = self._get_url(track_id, template_url, server_number)
-        if final_song_url is None:
-            raise ExtractorError('Unable to extract track url')
+        if not self._check_url(song_url, track_id, 'mp3'):
+            song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+            if not self._check_url(song_url, track_id, 'm4a'):
+                raise ExtractorError('Unable to extract track url')
 
         PREFIX = (
             r'm-play-on-spacebar[^>]+'
@@ -107,7 +95,7 @@ class MixcloudIE(InfoExtractor):
         return {
             'id': track_id,
             'title': title,
-            'url': final_song_url,
+            'url': song_url,
             'description': description,
             'thumbnail': thumbnail,
             'uploader': uploader,
@@ -115,35 +103,3 @@ class MixcloudIE(InfoExtractor):
             'view_count': view_count,
             'like_count': like_count,
         }
-
-
-def server_numbers(first, boundaries):
-    """ Server numbers to try in descending order of probable availability.
-    Starting from first (i.e. the number of the server hosting the preview file)
-    and going further and further up to the higher boundary and down to the
-    lower one in an alternating fashion. Namely:
-
-        server_numbers(2, (1, 5))
-
-        # Where the preview server is 2, min number is 1 and max is 5.
-        # Yields: 2, 3, 1, 4, 5
-
-    Why not random numbers or increasing sequences? Since from what I've seen,
-    full length files seem to be hosted on servers whose number is closer to
-    that of the preview; to be confirmed.
-    """
-    zip_longest = getattr(itertools, 'zip_longest', None)
-    if zip_longest is None:
-        # python 2.x
-        zip_longest = itertools.izip_longest
-
-    if len(boundaries) != 2:
-        raise ValueError("boundaries should be a two-element tuple")
-    min, max = boundaries
-    highs = range(first + 1, max + 1)
-    lows = range(first - 1, min - 1, -1)
-    rest = filter(
-        None, itertools.chain.from_iterable(zip_longest(highs, lows)))
-    yield first
-    for n in rest:
-        yield n
index e369551c2fb5730377d863cf3bc2bdde31eb5f60..e242b897f2b63cf624805c7564cf7e2f02a9d16b 100644 (file)
@@ -10,7 +10,21 @@ from ..utils import (
 
 
 class MLBIE(InfoExtractor):
-    _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[\da-z_-]+\.)*mlb\.com/
+                        (?:
+                            (?:
+                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+                                (?:
+                                    shared/video/embed/(?:embed|m-internal-embed)\.html|
+                                    (?:[^/]+/)+(?:play|index)\.jsp|
+                                )\?.*?\bcontent_id=
+                            )
+                            (?P<id>n?\d+)|
+                            (?:[^/]+/)*(?P<path>[^/]+)
+                        )
+                    '''
     _TESTS = [
         {
             'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -68,6 +82,18 @@ class MLBIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
+        {
+            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
+            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+            'info_dict': {
+                'id': '75609783',
+                'ext': 'mp4',
+                'title': 'Must C: Pillar climbs for catch',
+                'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
+                'timestamp': 1429124820,
+                'upload_date': '20150415',
+            }
+        },
         {
             'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
             'only_matching': True,
@@ -83,6 +109,15 @@ class MLBIE(InfoExtractor):
         {
             'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728',
             'only_matching': True,
+        },
+        {
+            # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer
+            'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#',
+            'only_matching': True,
         }
     ]
 
@@ -90,6 +125,12 @@ class MLBIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
+        if not video_id:
+            video_path = mobj.group('path')
+            webpage = self._download_webpage(url, video_path)
+            video_id = self._search_regex(
+                [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
+
         detail = self._download_xml(
             'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
             % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
index 2cec12d35ec1797dd7612ad49c5739e87f77e6c9..9bf99a54a98c4838c2b878db3ec165c867602110 100644 (file)
@@ -5,9 +5,9 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
-    compat_urllib_parse,
 )
 
 
@@ -34,7 +34,7 @@ class MofosexIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
-        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
+        video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
         path = compat_urllib_parse_urlparse(video_url).path
         extension = os.path.splitext(path)[1][1:]
         format = path.split('/')[5].split('_')[:2]
index 5de719bdc41d2af56d6133a85b998c4ed85af726..88dcd4f737544356091220d53078bc1c2e222d76 100644 (file)
@@ -9,6 +9,7 @@ from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
+from ..utils import ExtractorError
 
 
 class MonikerIE(InfoExtractor):
@@ -40,6 +41,15 @@ class MonikerIE(InfoExtractor):
         video_id = self._match_id(url)
         orig_webpage = self._download_webpage(url, video_id)
 
+        if '>File Not Found<' in orig_webpage:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        error = self._search_regex(
+            r'class="err">([^<]+)<', orig_webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
         fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
         data = dict(fields)
 
index c11de1cb61b28d03ab2430ff1db3a82d317dc718..b48fac5e3e434569642284d0b6388cab34696b01 100644 (file)
@@ -25,6 +25,7 @@ def _media_xml_tag(tag):
 
 class MTVServicesInfoExtractor(InfoExtractor):
     _MOBILE_TEMPLATE = None
+    _LANG = None
 
     @staticmethod
     def _id_from_uri(uri):
@@ -118,6 +119,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
         mediagen_doc = self._download_xml(mediagen_url, video_id,
                                           'Downloading video urls')
 
+        item = mediagen_doc.find('./video/item')
+        if item is not None and item.get('type') == 'text':
+            message = '%s returned error: ' % self.IE_NAME
+            if item.get('code') is not None:
+                message += '%s - ' % item.get('code')
+            message += item.text
+            raise ExtractorError(message, expected=True)
+
         description_node = itemdoc.find('description')
         if description_node is not None:
             description = description_node.text.strip()
@@ -161,8 +170,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
         video_id = self._id_from_uri(uri)
         feed_url = self._get_feed_url(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
+        info_url = feed_url + '?'
+        if self._LANG:
+            info_url += 'lang=%s&' % self._LANG
+        info_url += data
         idoc = self._download_xml(
-            feed_url + '?' + data, video_id,
+            info_url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
         return self.playlist_result(
             [self._get_video_info(item) for item in idoc.findall('.//item')])
index 5b9b9fbcd0844897d6d63305ed00729e70c7f4fb..4557a2b13b3e47a75242ebd4a5c095bf17cbaacf 100644 (file)
@@ -35,7 +35,8 @@ class MySpassIE(InfoExtractor):
 
         # get metadata
         metadata_url = META_DATA_URL_TEMPLATE % video_id
-        metadata = self._download_xml(metadata_url, video_id)
+        metadata = self._download_xml(
+            metadata_url, video_id, transform_source=lambda s: s.strip())
 
         # extract values from metadata
         url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py
new file mode 100644 (file)
index 0000000..4c65be1
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .vimple import SprutoBaseIE
+
+
+class MyviIE(SprutoBaseIE):
+    # Handles myvi.ru / myvi.tv player URLs (embed, flash, API and
+    # preloader.swf forms); the opaque token in the URL is used as the id
+    _VALID_URL = r'''(?x)
+                    https?://
+                        myvi\.(?:ru/player|tv)/
+                            (?:
+                                (?:
+                                    embed/html|
+                                    flash|
+                                    api/Video/Get
+                                )/|
+                                content/preloader\.swf\?.*\bid=
+                            )
+                            (?P<id>[\da-zA-Z_-]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
+        'info_dict': {
+            'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
+            'ext': 'mp4',
+            'title': 'хозяин жизни',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 25,
+        },
+    }, {
+        'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        """Return the src of the first Myvi iframe found in webpage, or None."""
+        embed = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
+        return embed.group('url') if embed else None
+
+    def _real_extract(self, url):
+        """Fetch the player API JSON for the video and extract its sprutoData."""
+        video_id = self._match_id(url)
+
+        api_url = 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id
+        response = self._download_json(api_url, video_id)
+
+        # _extract_spruto is not defined in this class; presumably it is
+        # provided by SprutoBaseIE
+        return self._extract_spruto(response['sprutoData'], video_id)
index 5e754fcffb6403cbd359b9b358ad3879ef279f53..c96f472a39e569c7dfb88682d36fad9ed6ce2c10 100644 (file)
@@ -10,6 +10,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_ord,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -107,7 +108,7 @@ class MyVideoIE(InfoExtractor):
             if not a == '_encxml':
                 params[a] = b
             else:
-                encxml = compat_urllib_parse.unquote(b)
+                encxml = compat_urllib_parse_unquote(b)
         if not params.get('domain'):
             params['domain'] = 'www.myvideo.de'
         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
@@ -135,7 +136,7 @@ class MyVideoIE(InfoExtractor):
         video_url = None
         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
         if mobj:
-            video_url = compat_urllib_parse.unquote(mobj.group(1))
+            video_url = compat_urllib_parse_unquote(mobj.group(1))
             if 'myvideo2flash' in video_url:
                 self.report_warning(
                     'Rewriting URL to use unencrypted rtmp:// ...',
@@ -147,10 +148,10 @@ class MyVideoIE(InfoExtractor):
             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
             if mobj is None:
                 raise ExtractorError('unable to extract url')
-            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+            video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2))
 
         video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
-        video_file = compat_urllib_parse.unquote(video_file)
+        video_file = compat_urllib_parse_unquote(video_file)
 
         if not video_file.endswith('f4m'):
             ppath, prefix = video_file.split('.')
@@ -159,7 +160,7 @@ class MyVideoIE(InfoExtractor):
             video_playpath = ''
 
         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
-        video_swfobj = compat_urllib_parse.unquote(video_swfobj)
+        video_swfobj = compat_urllib_parse_unquote(video_swfobj)
 
         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
                                               webpage, 'title')
index c18640c5a9f0093344475ad649643705522cddb9..f793b72f5c4398901d2c5faf8e2f283baef94cfc 100644 (file)
@@ -25,8 +25,11 @@ class NationalGeographicIE(InfoExtractor):
         name = url_basename(url)
 
         webpage = self._download_webpage(url, name)
-        feed_url = self._search_regex(r'data-feed-url="([^"]+)"', webpage, 'feed url')
-        guid = self._search_regex(r'data-video-guid="([^"]+)"', webpage, 'guid')
+        feed_url = self._search_regex(
+            r'data-feed-url="([^"]+)"', webpage, 'feed url')
+        guid = self._search_regex(
+            r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
+            webpage, 'guid')
 
         feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
         content = feed.find('.//{http://search.yahoo.com/mrss/}content')
index c10405f04d3cc1b3e89004029b7502112e9baa29..925967753bd12816005b5ed8929f0438e9ec0214 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
 class NaverIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://tvcast.naver.com/v/81652',
         'info_dict': {
             'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
             'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
             'upload_date': '20130903',
         },
-    }
+    }, {
+        'url': 'http://tvcast.naver.com/v/395837',
+        'md5': '638ed4c12012c458fefcddfd01f173cd',
+        'info_dict': {
+            'id': '395837',
+            'ext': 'mp4',
+            'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+            'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+            'upload_date': '20150519',
+        },
+        'skip': 'Georestricted',
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
                          webpage)
         if m_id is None:
             m_error = re.search(
-                r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+                r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
                 webpage)
             if m_error:
                 raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
         formats = []
         for format_el in urls.findall('EncodingOptions/EncodingOption'):
             domain = format_el.find('Domain').text
+            uri = format_el.find('uri').text
             f = {
-                'url': domain + format_el.find('uri').text,
+                'url': compat_urlparse.urljoin(domain, uri),
                 'ext': 'mp4',
                 'width': int(format_el.find('width').text),
                 'height': int(format_el.find('height').text),
             }
             if domain.startswith('rtmp'):
+                # urlparse does not support custom schemes
+                # https://bugs.python.org/issue18828
                 f.update({
+                    'url': domain + uri,
                     'ext': 'flv',
                     'rtmp_protocol': '1',  # rtmpt
                 })
index 862b706bf96719aa071f1f89c73f2a4ef45a20b1..944096e1ca15de964fcdf896adf988c9aa2264bd 100644 (file)
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
+    }, {
+        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+        'info_dict': {
+            'id': '0041400301-cle-atl-recap.nba',
+            'ext': 'mp4',
+            'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+            'duration': 228,
+        },
+        'params': {
+            'skip_download': True,
+        }
     }]
 
     def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
             self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
 
         description = self._og_search_description(webpage)
-        duration = parse_duration(
-            self._html_search_meta('duration', webpage, 'duration'))
+        duration_str = self._html_search_meta(
+            'duration', webpage, 'duration', default=None)
+        if not duration_str:
+            duration_str = self._html_search_regex(
+                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+        duration = parse_duration(duration_str)
 
         return {
             'id': shortened_video_id,
index ecd0ac8b1b501d9ad97261f57a5b0fee1cd68ce7..dc2091be0d0c8706b2f3b6d78d88fa22fcb8b6d1 100644 (file)
@@ -10,6 +10,8 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     find_xpath_attr,
+    lowercase_escape,
+    unescapeHTML,
 )
 
 
@@ -37,14 +39,32 @@ class NBCIE(InfoExtractor):
             },
             'skip': 'Only works from US',
         },
+        {
+            'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
+            'info_dict': {
+                'id': '8iUuyzWDdYUZ',
+                'ext': 'flv',
+                'title': 'Star Wars Teaser',
+                'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+            },
+            'skip': 'Only works from US',
+        },
+        {
+            # This video has expired but with an escaped embedURL
+            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
+            'skip': 'Expired'
+        }
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        theplatform_url = self._search_regex(
-            '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
-            webpage, 'theplatform url').replace('_no_endcard', '')
+        theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
+            [
+                r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+                r'"embedURL"\s*:\s*"([^"]+)"'
+            ],
+            webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
         if theplatform_url.startswith('//'):
             theplatform_url = 'http:' + theplatform_url
         return self.url_result(theplatform_url)
index f49c666909a270ad18e36a2d1177ef681adc3121..79a13958b05e25a1c9e586168bb3a10742fbe01f 100644 (file)
@@ -8,41 +8,11 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
+    parse_duration,
 )
 
 
-class NDRIE(InfoExtractor):
-    IE_NAME = 'ndr'
-    IE_DESC = 'NDR.de - Mediathek'
-    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
-
-    _TESTS = [
-        {
-            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
-            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
-            'note': 'Video file',
-            'info_dict': {
-                'id': '25866',
-                'ext': 'mp4',
-                'title': 'Kartoffeltage in der Lewitz',
-                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
-                'duration': 166,
-            }
-        },
-        {
-            'url': 'http://www.ndr.de/info/audio51535.html',
-            'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
-            'note': 'Audio file',
-            'info_dict': {
-                'id': '51535',
-                'ext': 'mp3',
-                'title': 'La Valette entgeht der Hinrichtung',
-                'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
-                'duration': 884,
-            }
-        }
-    ]
-
+class NDRBaseIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
@@ -54,7 +24,11 @@ class NDRIE(InfoExtractor):
         if description:
             description = description.strip()
 
-        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
+        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None))
+        if not duration:
+            duration = parse_duration(self._html_search_regex(
+                r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)',
+                page, 'duration', default=None))
 
         formats = []
 
@@ -92,3 +66,65 @@ class NDRIE(InfoExtractor):
             'duration': duration,
             'formats': formats,
         }
+
+
+class NDRIE(NDRBaseIE):
+    # Extractor for www.ndr.de pages. This class only declares the name,
+    # URL pattern and test fixtures; it defines no extraction code of its
+    # own and relies on what NDRBaseIE provides.
+    IE_NAME = 'ndr'
+    IE_DESC = 'NDR.de - Mediathek'
+    # The id group captures digits directly in front of the ".html" suffix
+    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
+
+    _TESTS = [
+        {
+            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
+            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
+            'note': 'Video file',
+            'info_dict': {
+                'id': '25866',
+                'ext': 'mp4',
+                'title': 'Kartoffeltage in der Lewitz',
+                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
+                'duration': 166,
+            },
+            'skip': '404 Not found',
+        },
+        {
+            'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
+            'md5': 'dadc003c55ae12a5d2f6bd436cd73f59',
+            'info_dict': {
+                'id': '988',
+                'ext': 'mp4',
+                'title': 'Party, Pötte und Parade',
+                'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.',
+                'duration': 3498,
+            },
+        },
+        {
+            'url': 'http://www.ndr.de/info/audio51535.html',
+            'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+            'note': 'Audio file',
+            'info_dict': {
+                'id': '51535',
+                'ext': 'mp3',
+                'title': 'La Valette entgeht der Hinrichtung',
+                'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
+                'duration': 884,
+            }
+        }
+    ]
+
+
+class NJoyIE(NDRBaseIE):
+    # Extractor for www.n-joy.de pages. Like its sibling for ndr.de it only
+    # declares a name, URL pattern and test fixture on top of NDRBaseIE.
+    IE_NAME = 'N-JOY'
+    # The id group captures digits directly in front of the ".html" suffix
+    _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html'
+
+    _TEST = {
+        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
+        'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+        'info_dict': {
+            'id': '2480',
+            'ext': 'mp4',
+            'title': 'Benaissa beim NDR Comedy Contest',
+            'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.',
+            'duration': 654,
+        }
+    }
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
new file mode 100644 (file)
index 0000000..a8e0a64
--- /dev/null
@@ -0,0 +1,459 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from hashlib import md5
+from base64 import b64encode
+from datetime import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urllib_parse,
+    compat_str,
+    compat_itertools_count,
+)
+
+
+class NetEaseMusicBaseIE(InfoExtractor):
+    """Shared helpers for the music.163.com (NetEase) extractors."""
+    # Keys of a song-info dict that may each describe one audio rendition.
+    _FORMATS = ['bMusic', 'mMusic', 'hMusic']
+    # Repeating key XORed over the dfsId when deriving the download URL.
+    _NETEASE_SALT = '3go8&$8*3*3h0k(2)2'
+    _API_BASE = 'http://music.163.com/api/'
+
+    @classmethod
+    def _encrypt(cls, dfsid):
+        """Return the URL path token derived from *dfsid*.
+
+        The ASCII encoding of the id is XORed with the repeating salt, the
+        result is MD5-hashed, and the digest is base64-encoded with '/' and
+        '+' replaced by '_' and '-'.
+        """
+        salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
+        string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
+        salt_len = len(salt_bytes)
+        for i, byte in enumerate(string_bytes):
+            string_bytes[i] = byte ^ salt_bytes[i % salt_len]
+        m = md5()
+        m.update(bytes(string_bytes))
+        result = b64encode(m.digest()).decode('ascii')
+        return result.replace('/', '_').replace('+', '-')
+
+    @classmethod
+    def extract_formats(cls, info):
+        """Return a list of format dicts for the renditions found in *info*.
+
+        *info* is a song-info dict; each key listed in _FORMATS that holds a
+        non-empty value contributes one entry.
+        """
+        formats = []
+        for song_format in cls._FORMATS:
+            details = info.get(song_format)
+            if not details:
+                continue
+            formats.append({
+                'url': 'http://m1.music.126.net/%s/%s.%s' %
+                       (cls._encrypt(details['dfsId']), details['dfsId'],
+                        details['extension']),
+                'ext': details.get('extension'),
+                'abr': details.get('bitrate', 0) / 1000,
+                'format_id': song_format,
+                'filesize': details.get('size'),
+                'asr': details.get('sr')
+            })
+        return formats
+
+    @classmethod
+    def convert_milliseconds(cls, ms):
+        """Convert *ms* milliseconds to whole seconds, rounded to nearest.
+
+        Returns None when *ms* is None, so callers can pass optional API
+        fields (such as a missing 'publishTime') straight through instead
+        of failing with a TypeError.
+        """
+        if ms is None:
+            return None
+        return int(round(ms / 1000.0))
+
+    def query_api(self, endpoint, video_id, note):
+        """Download and return the JSON found at _API_BASE + *endpoint*.
+
+        The request carries a Referer header set to _API_BASE; *video_id*
+        and *note* are handed to _download_json unchanged.
+        """
+        req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint))
+        req.add_header('Referer', self._API_BASE)
+        return self._download_json(req, video_id, note)
+
+
+class NetEaseMusicIE(NetEaseMusicBaseIE):
+    """Extractor for single songs on music.163.com."""
+    IE_NAME = 'netease:song'
+    IE_DESC = '网易云音乐'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/song?id=32102397',
+        'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
+        'info_dict': {
+            'id': '32102397',
+            'ext': 'mp3',
+            'title': 'Bad Blood (feat. Kendrick Lamar)',
+            'creator': 'Taylor Swift / Kendrick Lamar',
+            'upload_date': '20150517',
+            'timestamp': 1431878400,
+            'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
+        },
+    }, {
+        'note': 'No lyrics translation.',
+        'url': 'http://music.163.com/#/song?id=29822014',
+        'info_dict': {
+            'id': '29822014',
+            'ext': 'mp3',
+            'title': '听见下雨的声音',
+            'creator': '周杰伦',
+            'upload_date': '20141225',
+            'timestamp': 1419523200,
+            'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
+        },
+    }, {
+        'note': 'No lyrics.',
+        'url': 'http://music.163.com/song?id=17241424',
+        'info_dict': {
+            'id': '17241424',
+            'ext': 'mp3',
+            'title': 'Opus 28',
+            'creator': 'Dustin O\'Halloran',
+            'upload_date': '20080211',
+            'timestamp': 1202745600,
+        },
+    }, {
+        'note': 'Has translated name.',
+        'url': 'http://music.163.com/#/song?id=22735043',
+        'info_dict': {
+            'id': '22735043',
+            'ext': 'mp3',
+            'title': '소원을 말해봐 (Genie)',
+            'creator': '少女时代',
+            'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
+            'upload_date': '20100127',
+            'timestamp': 1264608000,
+            'alt_title': '说出愿望吧(Genie)',
+        }
+    }]
+
+    def _process_lyrics(self, lyrics_info):
+        """Return the lyrics text built from the lyric API response.
+
+        Without a translation the original lyric is returned as is (None
+        when there is none). Otherwise every time-stamped line of the
+        original is joined with the translated line carrying the same time
+        stamp, as "<stamp><original> / <translation>".
+        """
+        # "or {}" also covers a key that is present but holds null.
+        original = (lyrics_info.get('lrc') or {}).get('lyric')
+        translated = (lyrics_info.get('tlyric') or {}).get('lyric')
+
+        # Merging needs both texts; bailing out here also keeps
+        # re.findall() from being handed None when only a translation exists.
+        if not original or not translated:
+            return original
+
+        lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
+        original_ts_texts = re.findall(lyrics_expr, original)
+        # findall() yields (time stamp, text) pairs, which dict() accepts.
+        translation_ts_dict = dict(re.findall(lyrics_expr, translated))
+        lyrics = '\n'.join([
+            '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, ''))
+            for time_stamp, text in original_ts_texts
+        ])
+        return lyrics
+
+    def _real_extract(self, url):
+        """Fetch song details and lyrics and return the info dict."""
+        song_id = self._match_id(url)
+
+        params = {
+            'id': song_id,
+            'ids': '[%s]' % song_id
+        }
+        info = self.query_api(
+            'song/detail?' + compat_urllib_parse.urlencode(params),
+            song_id, 'Downloading song info')['songs'][0]
+
+        formats = self.extract_formats(info)
+        self._sort_formats(formats)
+
+        lyrics_info = self.query_api(
+            'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
+            song_id, 'Downloading lyrics data')
+        lyrics = self._process_lyrics(lyrics_info)
+
+        # Translated names, when present, become the alternative title.
+        alt_title = None
+        if info.get('transNames'):
+            alt_title = '/'.join(info.get('transNames'))
+
+        return {
+            'id': song_id,
+            'title': info['name'],
+            'alt_title': alt_title,
+            'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
+            'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
+            'thumbnail': info.get('album', {}).get('picUrl'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+            'description': lyrics,
+            'formats': formats,
+        }
+
+
+class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
+    """Playlist extractor for album pages on music.163.com."""
+    IE_NAME = 'netease:album'
+    IE_DESC = '网易云音乐 - 专辑'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/album?id=220780',
+        'info_dict': {
+            'id': '220780',
+            'title': 'B\'day',
+        },
+        'playlist_count': 23,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist holding one URL entry per song of the album."""
+        album_id = self._match_id(url)
+
+        api_path = 'album/%s?id=%s' % (album_id, album_id)
+        response = self.query_api(api_path, album_id, 'Downloading album data')
+        album = response['album']
+
+        title = album['name']
+        description = album.get('description')
+
+        # Each song is delegated to the single-song extractor.
+        entries = []
+        for song in album['songs']:
+            song_url = 'http://music.163.com/#/song?id=%s' % song['id']
+            entries.append(
+                self.url_result(song_url, 'NetEaseMusic', song['id']))
+
+        return self.playlist_result(entries, album_id, title, description)
+
+
+class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
+    """Playlist extractor for artist pages on music.163.com."""
+    IE_NAME = 'netease:singer'
+    IE_DESC = '网易云音乐 - 歌手'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'note': 'Singer has aliases.',
+        'url': 'http://music.163.com/#/artist?id=10559',
+        'info_dict': {
+            'id': '10559',
+            'title': '张惠妹 - aMEI;阿密特',
+        },
+        'playlist_count': 50,
+    }, {
+        'note': 'Singer has translated name.',
+        'url': 'http://music.163.com/#/artist?id=124098',
+        'info_dict': {
+            'id': '124098',
+            'title': '李昇基 - 이승기',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist of the artist's 'hotSongs'."""
+        singer_id = self._match_id(url)
+
+        api_path = 'artist/%s?id=%s' % (singer_id, singer_id)
+        info = self.query_api(api_path, singer_id, 'Downloading singer data')
+        artist = info['artist']
+
+        # Title layout: "<name>[ - <translated name>][ - <alias>;<alias>...]"
+        title = artist['name']
+        translated_name = artist['trans']
+        if translated_name:
+            title = '%s - %s' % (title, translated_name)
+        aliases = artist['alias']
+        if aliases:
+            title = '%s - %s' % (title, ';'.join(aliases))
+
+        entries = []
+        for song in info['hotSongs']:
+            song_url = 'http://music.163.com/#/song?id=%s' % song['id']
+            entries.append(
+                self.url_result(song_url, 'NetEaseMusic', song['id']))
+
+        return self.playlist_result(entries, singer_id, title)
+
+
+class NetEaseMusicListIE(NetEaseMusicBaseIE):
+    """Playlist extractor for playlists and toplists on music.163.com."""
+    IE_NAME = 'netease:playlist'
+    IE_DESC = '网易云音乐 - 歌单'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/playlist?id=79177352',
+        'info_dict': {
+            'id': '79177352',
+            'title': 'Billboard 2007 Top 100',
+            'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
+        },
+        'playlist_count': 99,
+    }, {
+        'note': 'Toplist/Charts sample',
+        'url': 'http://music.163.com/#/discover/toplist?id=3733003',
+        'info_dict': {
+            'id': '3733003',
+            'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}',
+            'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist with one URL entry per track of the list."""
+        list_id = self._match_id(url)
+
+        api_path = 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id
+        response = self.query_api(
+            api_path, list_id, 'Downloading playlist data')
+        info = response['result']
+
+        title = info['name']
+        description = info.get('description')
+
+        # specialType 10 marks a chart/toplist; its title gets the date of
+        # the last update appended.
+        is_toplist = info.get('specialType') == 10
+        if is_toplist:
+            updated_at = datetime.fromtimestamp(
+                self.convert_milliseconds(info['updateTime']))
+            title = '%s %s' % (title, updated_at.strftime('%Y-%m-%d'))
+
+        entries = []
+        for song in info['tracks']:
+            song_url = 'http://music.163.com/#/song?id=%s' % song['id']
+            entries.append(
+                self.url_result(song_url, 'NetEaseMusic', song['id']))
+
+        return self.playlist_result(entries, list_id, title, description)
+
+
+class NetEaseMusicMvIE(NetEaseMusicBaseIE):
+    """Extractor for music videos (MV) on music.163.com."""
+    IE_NAME = 'netease:mv'
+    IE_DESC = '网易云音乐 - MV'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/mv?id=415350',
+        'info_dict': {
+            'id': '415350',
+            'ext': 'mp4',
+            'title': '이럴거면 그러지말지',
+            'description': '白雅言自作曲唱甜蜜爱情',
+            'creator': '白雅言',
+            'upload_date': '20150520',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Fetch the MV details and return the info dict."""
+        mv_id = self._match_id(url)
+
+        response = self.query_api(
+            'mv/detail?id=%s&type=mp4' % mv_id,
+            mv_id, 'Downloading mv info')
+        info = response['data']
+
+        # 'brs' maps a resolution label (used as the height) to a video URL.
+        formats = []
+        for brs, mv_url in info['brs'].items():
+            formats.append({
+                'url': mv_url,
+                'ext': 'mp4',
+                'format_id': '%sp' % brs,
+                'height': int(brs),
+            })
+        self._sort_formats(formats)
+
+        title = info['name']
+        description = info.get('desc') or info.get('briefDesc')
+        creator = info['artistName']
+        # 'publishTime' is dash-separated; dropping the dashes gives the
+        # upload_date form.
+        upload_date = info['publishTime'].replace('-', '')
+
+        return {
+            'id': mv_id,
+            'title': title,
+            'description': description,
+            'creator': creator,
+            'upload_date': upload_date,
+            'formats': formats,
+            'thumbnail': info.get('cover'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+        }
+
+
+class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
+    """Extractor for radio programs on music.163.com.
+
+    A program yields either a single audio (its 'mainSong') or, when it has
+    accompanying songs and --no-playlist is not given, a playlist of the
+    main song followed by those songs.
+    """
+    IE_NAME = 'netease:program'
+    IE_DESC = '网易云音乐 - 电台节目'
+    # The "#", with or without a following "/", is optional, so hash-less
+    # URLs are accepted just as by the sibling extractors.
+    _VALID_URL = r'https?://music\.163\.com/(#/?)?program\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/program?id=10109055',
+        'info_dict': {
+            'id': '10109055',
+            'ext': 'mp3',
+            'title': '不丹足球背后的故事',
+            'description': '喜马拉雅人的足球梦 ...',
+            'creator': '大话西藏',
+            'timestamp': 1434179342,
+            'upload_date': '20150613',
+            'duration': 900,
+        },
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+        },
+        'playlist_count': 4,
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'ext': 'mp3',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+            'timestamp': 1434450841,
+            'upload_date': '20150616',
+        },
+        'params': {
+            'noplaylist': True
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return the program as a single audio or as a playlist."""
+        program_id = self._match_id(url)
+
+        info = self.query_api(
+            'dj/program/detail?id=%s' % program_id,
+            program_id, 'Downloading program info')['program']
+
+        name = info['name']
+        description = info['description']
+
+        # Single-audio result: no accompanying songs, or the user asked
+        # for --no-playlist.
+        if not info['songs'] or self._downloader.params.get('noplaylist'):
+            if info['songs']:
+                self.to_screen(
+                    'Downloading just the main audio %s because of --no-playlist'
+                    % info['mainSong']['id'])
+
+            formats = self.extract_formats(info['mainSong'])
+            self._sort_formats(formats)
+
+            return {
+                'id': program_id,
+                'title': name,
+                'description': description,
+                'creator': info['dj']['brand'],
+                'timestamp': self.convert_milliseconds(info['createTime']),
+                'thumbnail': info['coverUrl'],
+                'duration': self.convert_milliseconds(info.get('duration', 0)),
+                'formats': formats,
+            }
+
+        self.to_screen(
+            'Downloading playlist %s - add --no-playlist to just download the main audio %s'
+            % (program_id, info['mainSong']['id']))
+
+        # The main song comes first, followed by the accompanying songs.
+        song_ids = [info['mainSong']['id']]
+        song_ids.extend([song['id'] for song in info['songs']])
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song_id,
+                            'NetEaseMusic', song_id)
+            for song_id in song_ids
+        ]
+        return self.playlist_result(entries, program_id, name, description)
+
+
+class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
+    """Playlist extractor for DJ radio stations on music.163.com."""
+    IE_NAME = 'netease:djradio'
+    IE_DESC = '网易云音乐 - 电台'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/djradio?id=42',
+        'info_dict': {
+            'id': '42',
+            'title': '声音蔓延',
+            'description': 'md5:766220985cbd16fdd552f64c578a6b15'
+        },
+        'playlist_mincount': 40,
+    }
+    # Number of programs requested per API call ("limit" query parameter).
+    _PAGE_SIZE = 1000
+
+    def _real_extract(self, url):
+        """Page through the radio's programs and return them as a playlist.
+
+        Pages are requested until the response's 'more' flag is false. The
+        playlist title and description are taken from the 'radio' entry of
+        the first program seen; both stay None for a radio without programs.
+        """
+        dj_id = self._match_id(url)
+
+        name = None
+        desc = None
+        entries = []
+        for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE):
+            info = self.query_api(
+                'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
+                % (self._PAGE_SIZE, dj_id, offset),
+                dj_id, 'Downloading dj programs - %d' % offset)
+
+            entries.extend([
+                self.url_result(
+                    'http://music.163.com/#/program?id=%s' % program['id'],
+                    'NetEaseMusicProgram', program['id'])
+                for program in info['programs']
+            ])
+
+            # An empty page has no program to read the radio metadata from;
+            # skipping it avoids an IndexError.
+            if name is None and info['programs']:
+                radio = info['programs'][0]['radio']
+                name = radio['name']
+                desc = radio['desc']
+
+            if not info['more']:
+                break
+
+        return self.playlist_result(entries, dj_id, name, desc)
index bc17e20aa9d736eb9e4ba0a39929f20db47d8465..0d165a82ad53ac8ac16ca8943c934db9fb28b720 100644 (file)
@@ -49,7 +49,7 @@ class NetzkinoIE(InfoExtractor):
             'http://www.netzkino.de/beta/dist/production.min.js', video_id,
             note='Downloading player code')
         avo_js = self._search_regex(
-            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+            r'var urlTemplate=(\{.*?"\})',
             production_js, 'URL templates')
         templates = self._parse_json(
             avo_js, video_id, transform_source=js_to_json)
index 85fcad06b51dc9ce87bdd563043c92a126cc8eea..5a9e73cd66a1b1224bdec848722f5e9d14f65c38 100644 (file)
@@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor):
         page = self._download_webpage(url, video_id, 'Downloading page')
 
         video_guid = self._html_search_regex(
-            r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
             page, 'video GUID')
 
         player = self._download_xml(
index 02dba4ef639e64deff790f94bd5cd0cd6da2a4cb..c10784f6b7321395e69c86ab06f91f8d3b37b655 100644 (file)
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601
 
 
 class NextMediaIE(InfoExtractor):
+    IE_DESC = '蘋果日報'
     _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
@@ -66,6 +67,7 @@ class NextMediaIE(InfoExtractor):
 
 
 class NextMediaActionNewsIE(NextMediaIE):
+    IE_DESC = '蘋果日報 - 動新聞'
     _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
     _TESTS = [{
         'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
@@ -89,8 +91,9 @@ class NextMediaActionNewsIE(NextMediaIE):
         return self._extract_from_nextmedia_page(news_id, url, article_page)
 
 
-class AppleDailyRealtimeNewsIE(NextMediaIE):
-    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+    IE_DESC = '臺灣蘋果日報'
+    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
     _TESTS = [{
         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +102,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:b23787119933404ce515c6356a8c355c',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
             'upload_date': '20150128',
         }
     }, {
@@ -110,26 +113,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '不滿被踩腳 山東兩大媽一路打下車',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
             'upload_date': '20150128',
         }
-    }]
-
-    _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
-    def _fetch_title(self, page):
-        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
-    def _fetch_thumbnail(self, page):
-        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
-    def _fetch_timestamp(self, page):
-        return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
-    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
-    _TESTS = [{
+    }, {
         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
         'md5': '03df296d95dedc2d5886debbb80cb43f',
         'info_dict': {
@@ -154,10 +141,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
         'expected_warnings': [
             'video thumbnail',
         ]
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'only_matching': True,
     }]
 
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
     def _fetch_title(self, page):
-        return self._html_search_meta('description', page, 'news title')
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+                self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
 
     def _fetch_description(self, page):
         return self._html_search_meta('description', page, 'news description')
index 2684dd250aa65e22903612f4a1780fc8f701296a..dc54634a58e440fc70ae9bcb3e7d5781981b2b1e 100644 (file)
@@ -19,7 +19,7 @@ class NFLIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
         (?:.+?/)*
-        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+        (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
     _TESTS = [
         {
             'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
@@ -58,6 +58,10 @@ class NFLIE(InfoExtractor):
                 'upload_date': '20150202',
             },
         },
+        {
+            'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+            'only_matching': True,
+        }
     ]
 
     @staticmethod
index 40746599880469f5c79110020f39d31f2a8cbff6..279b18386197560346e1cbce716ecf7ff61af2f9 100644 (file)
@@ -21,6 +21,9 @@ class NHLBaseInfoExtractor(InfoExtractor):
         return json_string.replace('\\\'', '\'')
 
     def _real_extract_video(self, video_id):
+        vid_parts = video_id.split(',')
+        if len(vid_parts) == 3:
+            video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0'))
         json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
         data = self._download_json(
             json_url, video_id, transform_source=self._fix_json)
@@ -47,7 +50,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
             video_url = initial_video_url
 
         join = compat_urlparse.urljoin
-        return {
+        ret = {
             'id': video_id,
             'title': info['name'],
             'url': video_url,
@@ -56,11 +59,20 @@ class NHLBaseInfoExtractor(InfoExtractor):
             'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
             'upload_date': unified_strdate(info['releaseDate'].split('.')[0]),
         }
+        if video_url.startswith('rtmp:'):
+            mobj = re.match(r'(?P<tc_url>rtmp://[^/]+/(?P<app>[a-z0-9/]+))/(?P<play_path>mp4:.*)', video_url)
+            ret.update({
+                'tc_url': mobj.group('tc_url'),
+                'play_path': mobj.group('play_path'),
+                'app': mobj.group('app'),
+                'no_resume': True,
+            })
+        return ret
 
 
 class NHLIE(NHLBaseInfoExtractor):
     IE_NAME = 'nhl.com'
-    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
+    _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)(?:id|hlg)=(?P<id>[-0-9a-zA-Z,]+)'
 
     _TESTS = [{
         'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
@@ -101,6 +113,29 @@ class NHLIE(NHLBaseInfoExtractor):
     }, {
         'url': 'http://video.nhl.com/videocenter/?id=736722',
         'only_matching': True,
+    }, {
+        'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en',
+        'md5': '076fcb88c255154aacbf0a7accc3f340',
+        'info_dict': {
+            'id': '2014020299-X-h',
+            'ext': 'mp4',
+            'title': 'Penguins at Islanders / Game Highlights',
+            'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014',
+            'duration': 268,
+            'upload_date': '20141122',
+        }
+    }, {
+        'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4',
+        'info_dict': {
+            'id': '691469',
+            'ext': 'mp4',
+            'title': 'RAW | Craig MacTavish Full Press Conference',
+            'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.',
+            'upload_date': '20141205',
+        },
+        'params': {
+            'skip_download': True,  # Requires rtmpdump
+        }
     }]
 
     def _real_extract(self, url):
index ddec7b3387f98e8bc86ab44feefd175676d40489..0f8aa5adad5b2247621ce00249f3bd03a33a104a 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 import re
 import json
+import datetime
 
 from .common import InfoExtractor
 from ..compat import (
@@ -14,7 +15,9 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     parse_duration,
-    unified_strdate,
+    parse_iso8601,
+    xpath_text,
+    determine_ext,
 )
 
 
@@ -32,30 +35,50 @@ class NiconicoIE(InfoExtractor):
             'uploader': 'takuya0301',
             'uploader_id': '2698420',
             'upload_date': '20131123',
+            'timestamp': 1385182762,
             'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
             'duration': 33,
         },
-        'params': {
-            'username': 'ydl.niconico@gmail.com',
-            'password': 'youtube-dl',
-        },
     }, {
+        # File downloaded with and without credentials are different, so omit
+        # the md5 field
         'url': 'http://www.nicovideo.jp/watch/nm14296458',
-        'md5': '8db08e0158457cf852a31519fceea5bc',
         'info_dict': {
             'id': 'nm14296458',
             'ext': 'swf',
             'title': '【鏡音リン】Dance on media【オリジナル】take2!',
-            'description': 'md5:',
+            'description': 'md5:689f066d74610b3b22e0f1739add0f58',
             'uploader': 'りょうた',
             'uploader_id': '18822557',
             'upload_date': '20110429',
+            'timestamp': 1304065916,
             'duration': 209,
         },
-        'params': {
-            'username': 'ydl.niconico@gmail.com',
-            'password': 'youtube-dl',
+    }, {
+        # video exists but is marked as "deleted"
+        # md5 is unstable
+        'url': 'http://www.nicovideo.jp/watch/sm10000',
+        'info_dict': {
+            'id': 'sm10000',
+            'ext': 'unknown_video',
+            'description': 'deleted',
+            'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+            'upload_date': '20071224',
+            'timestamp': 1198527840,  # timestamp field has different value if logged in
+            'duration': 304,
         },
+    }, {
+        'url': 'http://www.nicovideo.jp/watch/so22543406',
+        'info_dict': {
+            'id': '1388129933',
+            'ext': 'mp4',
+            'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
+            'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+            'timestamp': 1388851200,
+            'upload_date': '20140104',
+            'uploader': 'アニメロチャンネル',
+            'uploader_id': '312',
+        }
     }]
 
     _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
@@ -95,9 +118,13 @@ class NiconicoIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # Get video webpage. We are not actually interested in it, but need
-        # the cookies in order to be able to download the info webpage
-        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+        # Get video webpage. We are not actually interested in it for normal
+        # cases, but need the cookies in order to be able to download the
+        # info webpage
+        webpage, handle = self._download_webpage_handle(
+            'http://www.nicovideo.jp/watch/' + video_id, video_id)
+        if video_id.startswith('so'):
+            video_id = self._match_id(handle.geturl())
 
         video_info = self._download_xml(
             'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
@@ -127,22 +154,77 @@ class NiconicoIE(InfoExtractor):
                 flv_info_request, video_id,
                 note='Downloading flv info', errnote='Unable to download flv info')
 
-        if 'deleted=' in flv_info_webpage:
-            raise ExtractorError('The video has been deleted.',
-                                 expected=True)
-        video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+        flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+        if 'url' not in flv_info:
+            if 'deleted' in flv_info:
+                raise ExtractorError('The video has been deleted.',
+                                     expected=True)
+            else:
+                raise ExtractorError('Unable to find video URL')
+
+        video_real_url = flv_info['url'][0]
 
         # Start extracting information
-        title = video_info.find('.//title').text
-        extension = video_info.find('.//movie_type').text
-        video_format = extension.upper()
-        thumbnail = video_info.find('.//thumbnail_url').text
-        description = video_info.find('.//description').text
-        upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
-        view_count = int_or_none(video_info.find('.//view_counter').text)
-        comment_count = int_or_none(video_info.find('.//comment_num').text)
-        duration = parse_duration(video_info.find('.//length').text)
-        webpage_url = video_info.find('.//watch_url').text
+        title = xpath_text(video_info, './/title')
+        if not title:
+            title = self._og_search_title(webpage, default=None)
+        if not title:
+            title = self._html_search_regex(
+                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
+                webpage, 'video title')
+
+        watch_api_data_string = self._html_search_regex(
+            r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
+            webpage, 'watch api data', default=None)
+        watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
+        video_detail = watch_api_data.get('videoDetail', {})
+
+        extension = xpath_text(video_info, './/movie_type')
+        if not extension:
+            extension = determine_ext(video_real_url)
+
+        thumbnail = (
+            xpath_text(video_info, './/thumbnail_url') or
+            self._html_search_meta('image', webpage, 'thumbnail', default=None) or
+            video_detail.get('thumbnail'))
+
+        description = xpath_text(video_info, './/description')
+
+        timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
+        if not timestamp:
+            match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
+            if match:
+                timestamp = parse_iso8601(match.replace('+', ':00+'))
+        if not timestamp and video_detail.get('postedAt'):
+            timestamp = parse_iso8601(
+                video_detail['postedAt'].replace('/', '-'),
+                delimiter=' ', timezone=datetime.timedelta(hours=9))
+
+        view_count = int_or_none(xpath_text(video_info, './/view_counter'))
+        if not view_count:
+            match = self._html_search_regex(
+                r'>Views: <strong[^>]*>([^<]+)</strong>',
+                webpage, 'view count', default=None)
+            if match:
+                view_count = int_or_none(match.replace(',', ''))
+        view_count = view_count or video_detail.get('viewCount')
+
+        comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
+        if not comment_count:
+            match = self._html_search_regex(
+                r'>Comments: <strong[^>]*>([^<]+)</strong>',
+                webpage, 'comment count', default=None)
+            if match:
+                comment_count = int_or_none(match.replace(',', ''))
+        comment_count = comment_count or video_detail.get('commentCount')
+
+        duration = (parse_duration(
+            xpath_text(video_info, './/length') or
+            self._html_search_meta(
+                'video:duration', webpage, 'video duration', default=None)) or
+            video_detail.get('length'))
+
+        webpage_url = xpath_text(video_info, './/watch_url') or url
 
         if video_info.find('.//ch_id') is not None:
             uploader_id = video_info.find('.//ch_id').text
@@ -158,11 +240,11 @@ class NiconicoIE(InfoExtractor):
             'url': video_real_url,
             'title': title,
             'ext': extension,
-            'format': video_format,
+            'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
             'thumbnail': thumbnail,
             'description': description,
             'uploader': uploader,
-            'upload_date': upload_date,
+            'timestamp': timestamp,
             'uploader_id': uploader_id,
             'view_count': view_count,
             'comment_count': comment_count,
index 251e6da07457b7e7be6b5703b5769214ae299c3d..a53e27b274eaa21ac15a1dc5077001d520832696 100644 (file)
@@ -14,7 +14,9 @@ from ..compat import (
 from ..utils import (
     clean_html,
     ExtractorError,
-    unified_strdate,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
 )
 
 
@@ -25,21 +27,38 @@ class NocoIE(InfoExtractor):
     _SUB_LANG_TEMPLATE = '&sub_lang=%s'
     _NETRC_MACHINE = 'noco'
 
-    _TEST = {
-        'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
-        'md5': '0a993f0058ddbcd902630b2047ef710e',
-        'info_dict': {
-            'id': '11538',
-            'ext': 'mp4',
-            'title': 'Ami Ami Idol - Hello! France',
-            'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
-            'upload_date': '20140412',
-            'uploader': 'Nolife',
-            'uploader_id': 'NOL',
-            'duration': 2851.2,
+    _TESTS = [
+        {
+            'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+            'md5': '0a993f0058ddbcd902630b2047ef710e',
+            'info_dict': {
+                'id': '11538',
+                'ext': 'mp4',
+                'title': 'Ami Ami Idol - Hello! France',
+                'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+                'upload_date': '20140412',
+                'uploader': 'Nolife',
+                'uploader_id': 'NOL',
+                'duration': 2851.2,
+            },
+            'skip': 'Requires noco account',
         },
-        'skip': 'Requires noco account',
-    }
+        {
+            'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call',
+            'md5': 'c190f1f48e313c55838f1f412225934d',
+            'info_dict': {
+                'id': '12610',
+                'ext': 'mp4',
+                'title': 'The Guild #1 - Wake-Up Call',
+                'timestamp': 1403863200,
+                'upload_date': '20140627',
+                'uploader': 'LBL42',
+                'uploader_id': 'LBL',
+                'duration': 233.023,
+            },
+            'skip': 'Requires noco account',
+        }
+    ]
 
     def _real_initialize(self):
         self._login()
@@ -90,51 +109,70 @@ class NocoIE(InfoExtractor):
             'shows/%s/medias' % video_id,
             video_id, 'Downloading video JSON')
 
+        show = self._call_api(
+            'shows/by_id/%s' % video_id,
+            video_id, 'Downloading show JSON')[0]
+
+        options = self._call_api(
+            'users/init', video_id,
+            'Downloading user options JSON')['options']
+        audio_lang_pref = options.get('audio_language') or options.get('language', 'fr')
+
+        if audio_lang_pref == 'original':
+            audio_lang_pref = show['original_lang']
+        if len(medias) == 1:
+            audio_lang_pref = list(medias.keys())[0]
+        elif audio_lang_pref not in medias:
+            audio_lang_pref = 'fr'
+
         qualities = self._call_api(
             'qualities',
             video_id, 'Downloading qualities JSON')
 
         formats = []
 
-        for lang, lang_dict in medias['fr']['video_list'].items():
-            for format_id, fmt in lang_dict['quality_list'].items():
-                format_id_extended = '%s-%s' % (lang, format_id) if lang != 'none' else format_id
-
-                video = self._call_api(
-                    'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
-                    video_id, 'Downloading %s video JSON' % format_id_extended,
-                    lang if lang != 'none' else None)
-
-                file_url = video['file']
-                if not file_url:
-                    continue
-
-                if file_url in ['forbidden', 'not found']:
-                    popmessage = video['popmessage']
-                    self._raise_error(popmessage['title'], popmessage['message'])
-
-                formats.append({
-                    'url': file_url,
-                    'format_id': format_id_extended,
-                    'width': fmt['res_width'],
-                    'height': fmt['res_lines'],
-                    'abr': fmt['audiobitrate'],
-                    'vbr': fmt['videobitrate'],
-                    'filesize': fmt['filesize'],
-                    'format_note': qualities[format_id]['quality_name'],
-                    'preference': qualities[format_id]['priority'],
-                })
+        for audio_lang, audio_lang_dict in medias.items():
+            preference = 1 if audio_lang == audio_lang_pref else 0
+            for sub_lang, lang_dict in audio_lang_dict['video_list'].items():
+                for format_id, fmt in lang_dict['quality_list'].items():
+                    format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id)
+
+                    video = self._call_api(
+                        'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang),
+                        video_id, 'Downloading %s video JSON' % format_id_extended,
+                        sub_lang if sub_lang != 'none' else None)
+
+                    file_url = video['file']
+                    if not file_url:
+                        continue
+
+                    if file_url in ['forbidden', 'not found']:
+                        popmessage = video['popmessage']
+                        self._raise_error(popmessage['title'], popmessage['message'])
+
+                    formats.append({
+                        'url': file_url,
+                        'format_id': format_id_extended,
+                        'width': int_or_none(fmt.get('res_width')),
+                        'height': int_or_none(fmt.get('res_lines')),
+                        'abr': int_or_none(fmt.get('audiobitrate')),
+                        'vbr': int_or_none(fmt.get('videobitrate')),
+                        'filesize': int_or_none(fmt.get('filesize')),
+                        'format_note': qualities[format_id].get('quality_name'),
+                        'quality': qualities[format_id].get('priority'),
+                        'preference': preference,
+                    })
 
         self._sort_formats(formats)
 
-        show = self._call_api(
-            'shows/by_id/%s' % video_id,
-            video_id, 'Downloading show JSON')[0]
+        timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+        if timestamp is not None and timestamp < 0:
+            timestamp = None
 
-        upload_date = unified_strdate(show['online_date_start_utc'])
-        uploader = show['partner_name']
-        uploader_id = show['partner_key']
-        duration = show['duration_ms'] / 1000.0
+        uploader = show.get('partner_name')
+        uploader_id = show.get('partner_key')
+        duration = float_or_none(show.get('duration_ms'), 1000)
 
         thumbnails = []
         for thumbnail_key, thumbnail_url in show.items():
@@ -157,7 +195,7 @@ class NocoIE(InfoExtractor):
         if episode_number:
             title += ' #' + compat_str(episode_number)
         if episode:
-            title += ' - ' + episode
+            title += ' - ' + compat_str(episode)
 
         description = show.get('show_resume') or show.get('family_resume')
 
@@ -166,7 +204,7 @@ class NocoIE(InfoExtractor):
             'title': title,
             'description': description,
             'thumbnails': thumbnails,
-            'upload_date': upload_date,
+            'timestamp': timestamp,
             'uploader': uploader,
             'uploader_id': uploader_id,
             'duration': duration,
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644 (file)
index 0000000..3f9c776
--- /dev/null
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+    """Extractor for videos published on the *.nova.cz family of sites.
+
+    The numeric media id is scraped from the article page, the player config
+    is downloaded from tn.nova.cz and a single format (RTMP or direct URL) is
+    built from its ``mediafile`` entry.
+    """
+    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+    # Raw string: the pattern contains regex escapes (backslash-dot) that are
+    # not valid Python string escapes.
+    _VALID_URL = r'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+    _TESTS = [{
+        'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+        'info_dict': {
+            'id': '1608920',
+            'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+            'ext': 'flv',
+            'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+            'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'info_dict': {
+            'id': '1757139',
+            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+            'ext': 'mp4',
+            'title': 'Podzemní nemocnice v pražské Krči',
+            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+        'info_dict': {
+            'id': '1756825',
+            'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+            'ext': 'flv',
+            'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+            'description': 'md5:dc24e50be5908df83348e50d1431295e',  # Make sure this description is clean of html tags
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+        'info_dict': {
+            'id': '1756858',
+            'ext': 'flv',
+            'title': 'Televizní noviny - 30. 5. 2015',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+            'upload_date': '20150530',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'info_dict': {
+            'id': '1753621',
+            'ext': 'mp4',
+            'title': 'Zaklínač 3: Divoký hon',
+            'description': 're:.*Pokud se stejně jako my nemůžete.*',
+            'thumbnail': r're:https?://.*\.jpg(\?.*)?',
+            'upload_date': '20150521',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video embedded in the page at ``url``.
+
+        Raises whatever the regex/download helpers raise when the video id or
+        the player config cannot be obtained.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        site = mobj.group('site')
+
+        webpage = self._download_webpage(url, display_id)
+
+        # The media id appears in several different markup variants; the
+        # patterns are tried in order.
+        video_id = self._search_regex(
+            [r"(?:media|video_id)\s*:\s*'(\d+)'",
+             r'media=(\d+)',
+             r'id="article_video_(\d+)"',
+             r'id="player_(\d+)"'],
+            webpage, 'video id')
+
+        # Use the config URL embedded in the page when there is one.
+        config_url = self._search_regex(
+            r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+            webpage, 'config url', default=None)
+
+        if not config_url:
+            # Otherwise build it from a site id: taken from the page if
+            # present, else looked up by subdomain, else the default.
+            DEFAULT_SITE_ID = '23000'
+            SITES = {
+                'tvnoviny': DEFAULT_SITE_ID,
+                'novaplus': DEFAULT_SITE_ID,
+                'vymena': DEFAULT_SITE_ID,
+                'krasna': DEFAULT_SITE_ID,
+                'fanda': '30',
+                'tn': '30',
+                'doma': '30',
+            }
+
+            site_id = self._search_regex(
+                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+            config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+                          % (site_id, video_id))
+
+        # The response is trimmed to its outermost {...} before JSON parsing.
+        config = self._download_json(
+            config_url, display_id,
+            'Downloading config JSON',
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        mediafile = config['mediafile']
+        video_url = mediafile['src']
+
+        # RTMP(E) sources are split into connection URL, app and play path;
+        # anything else is used as a plain URL.
+        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+        if m:
+            formats = [{
+                'url': m.group('url'),
+                'app': m.group('app'),
+                'play_path': m.group('playpath'),
+                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+                'ext': 'flv',
+            }]
+        else:
+            formats = [{
+                'url': video_url,
+            }]
+        self._sort_formats(formats)
+
+        title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+        description = clean_html(self._og_search_description(webpage, default=None))
+        thumbnail = config.get('poster')
+
+        # The upload date is only recoverable on two sites: from the tail of
+        # the display id on novaplus and from a date_time span on fanda.
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644 (file)
index 0000000..0b5ff47
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    """Extractor for nowtv.de player pages.
+
+    Metadata is read from the api.nowtv.de v3 ``movies`` endpoint and one HLS
+    (m3u8) format is built for every entry in the returned file list.
+    """
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the player page at ``url``.
+
+        Raises ExtractorError when the API returns no files for the video
+        (geo-restricted, not free, or for an unknown reason).
+        """
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+            # No files and no known reason: fail with a clear message instead
+            # of crashing on files['items'] below.
+            raise ExtractorError(
+                'Video %s has no files available' % video_id)
+
+        # 'format' may be missing or null in the API response.
+        f = info.get('format') or {}
+        # Prefer the station reported by the API over the one in the URL.
+        station = f.get('station') or station
+
+        # Maps a station name to the host label used in the HLS URL. Both
+        # 'nitro' and the URL form 'rtlnitro' are accepted so that the URL
+        # fallback above cannot end in a KeyError.
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'rtlnitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index 5d84485714b9f360d47c8676710e9c3e6d9578c7..0c2d02c108ed425cf4688c34aa2a826ddaa400b4 100644 (file)
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     fix_xml_ampersands,
@@ -7,7 +9,6 @@ from ..utils import (
     qualities,
     strip_jsonp,
     unified_strdate,
-    url_basename,
 )
 
 
@@ -16,13 +17,42 @@ class NPOBaseIE(InfoExtractor):
         token_page = self._download_webpage(
             'http://ida.omroep.nl/npoplayer/i.js',
             video_id, note='Downloading token')
-        return self._search_regex(
+        token = self._search_regex(
             r'npoplayer\.token = "(.+?)"', token_page, 'token')
+        # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
+        token_l = list(token)
+        first = second = None
+        for i in range(5, len(token_l) - 4):
+            if token_l[i].isdigit():
+                if first is None:
+                    first = i
+                elif second is None:
+                    second = i
+        if first is None or second is None:
+            first = 12
+            second = 13
+
+        token_l[first], token_l[second] = token_l[second], token_l[first]
+
+        return ''.join(token_l)
 
 
 class NPOIE(NPOBaseIE):
-    IE_NAME = 'npo.nl'
-    _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)'
+    IE_NAME = 'npo'
+    IE_DESC = 'npo.nl and ntr.nl'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        npo:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+                                ntr\.nl/(?:[^/]+/){2,}|
+                                omroepwnl\.nl/video/fragment/[^/]+__
+                            )
+                        )
+                        (?P<id>[^/?#]+)
+                '''
 
     _TESTS = [
         {
@@ -42,7 +72,7 @@ class NPOIE(NPOBaseIE):
             'info_dict': {
                 'id': 'VARA_101191800',
                 'ext': 'm4v',
-                'title': 'De Mega Mike & Mega Thomas show',
+                'title': 'De Mega Mike & Mega Thomas show: The best of.',
                 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
                 'upload_date': '20090227',
                 'duration': 2400,
@@ -54,8 +84,8 @@ class NPOIE(NPOBaseIE):
             'info_dict': {
                 'id': 'VPWON_1169289',
                 'ext': 'm4v',
-                'title': 'Tegenlicht',
-                'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+                'title': 'Tegenlicht: De toekomst komt uit Afrika',
+                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
                 'upload_date': '20130225',
                 'duration': 3000,
             },
@@ -84,6 +114,30 @@ class NPOIE(NPOBaseIE):
                 'title': 'Hoe gaat Europa verder na Parijs?',
             },
         },
+        {
+            'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+            'md5': '01c6a2841675995da1f0cf776f03a9c3',
+            'info_dict': {
+                'id': 'VPWON_1233944',
+                'ext': 'm4v',
+                'title': 'Aap, poot, pies',
+                'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+                'upload_date': '20150508',
+                'duration': 599,
+            },
+        },
+        {
+            'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+            'md5': 'd30cd8417b8b9bca1fdff27428860d08',
+            'info_dict': {
+                'id': 'POW_00996502',
+                'ext': 'm4v',
+                'title': '''"Dit is wel een 'landslide'..."''',
+                'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+                'upload_date': '20150508',
+                'duration': 462,
+            },
+        }
     ]
 
     def _real_extract(self, url):
@@ -92,12 +146,24 @@ class NPOIE(NPOBaseIE):
 
     def _get_info(self, video_id):
         metadata = self._download_json(
-            'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+            'http://e.omroep.nl/metadata/%s' % video_id,
             video_id,
             # We have to remove the javascript callback
             transform_source=strip_jsonp,
         )
 
+        # For some videos actual video id (prid) is different (e.g. for
+        # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
+        # video id is POMS_WNL_853698 but prid is POW_00996502)
+        video_id = metadata.get('prid') or video_id
+
+        # titel is too generic in some cases so utilize aflevering_titel as well
+        # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
+        title = metadata['titel']
+        sub_title = metadata.get('aflevering_titel')
+        if sub_title and sub_title != title:
+            title += ': %s' % sub_title
+
         token = self._get_token(video_id)
 
         formats = []
@@ -170,8 +236,8 @@ class NPOIE(NPOBaseIE):
 
         return {
             'id': video_id,
-            'title': metadata['titel'],
-            'description': metadata['info'],
+            'title': title,
+            'description': metadata.get('info'),
             'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
             'upload_date': unified_strdate(metadata.get('gidsdatum')),
             'duration': parse_duration(metadata.get('tijdsduur')),
@@ -340,9 +406,8 @@ class NPORadioFragmentIE(InfoExtractor):
         }
 
 
-class TegenlichtVproIE(NPOIE):
-    IE_NAME = 'tegenlicht.vpro.nl'
-    _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
+class VPROIE(NPOIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
 
     _TESTS = [
         {
@@ -351,17 +416,72 @@ class TegenlichtVproIE(NPOIE):
             'info_dict': {
                 'id': 'VPWON_1169289',
                 'ext': 'm4v',
-                'title': 'Tegenlicht',
-                'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+                'title': 'De toekomst komt uit Afrika',
+                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
                 'upload_date': '20130225',
             },
         },
+        {
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
+            'info_dict': {
+                'id': 'sergio-herman',
+                'title': 'Sergio Herman: Fucking perfect',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # playlist with youtube embed
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
+            'info_dict': {
+                'id': 'education-education',
+                'title': '2Doc',
+            },
+            'playlist_count': 2,
+        }
     ]
 
     def _real_extract(self, url):
-        name = url_basename(url)
-        webpage = self._download_webpage(url, name)
-        urn = self._html_search_meta('mediaurn', webpage)
-        info_page = self._download_json(
-            'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name)
-        return self._get_info(info_page['mid'])
+        # The page is returned as a playlist with one entry per
+        # data-media-id attribute found in its markup.
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        # Values starting with "http" are passed through as URLs; anything
+        # else is wrapped in the "npo:" scheme.
+        entries = [
+            self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+            for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
+        ]
+
+        # Prefer the <title> text without its " - Teledoc - VPRO" suffix and
+        # fall back to the OpenGraph title when that pattern does not match.
+        playlist_title = self._search_regex(
+            r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
+            webpage, 'playlist title', default=None) or self._og_search_title(webpage)
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class WNLIE(InfoExtractor):
+    """Playlist extractor for omroepwnl.nl video detail pages.
+
+    Every "Deel N" link carrying the js-mid class becomes one playlist entry
+    that is handed to the NPO extractor through an ``npo:`` URL.
+    """
+    _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+
+    _TEST = {
+        'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
+        'info_dict': {
+            'id': 'vandaag-de-dag-6-mei',
+            'title': 'Vandaag de Dag 6 mei',
+        },
+        'playlist_count': 4,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist result built from the part links on the page."""
+        page_id = self._match_id(url)
+        html = self._download_webpage(url, page_id)
+
+        # Each match is (href value, "Deel N" label); only the href is used.
+        part_links = re.findall(
+            r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', html)
+        entries = []
+        for media_id, _label in part_links:
+            entries.append(self.url_result('npo:%s' % media_id, 'NPO'))
+
+        heading = self._html_search_regex(
+            r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
+            html, 'playlist title')
+
+        return self.playlist_result(entries, page_id, heading)
index e91d3a248ec3367af8b7e9a8f60be0d503877481..d066a96db137ee3fb2c36f712803622e73b4aa40 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -14,7 +13,7 @@ from ..utils import (
 
 
 class NRKIE(InfoExtractor):
-    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
 
     _TESTS = [
         {
@@ -77,7 +76,7 @@ class NRKIE(InfoExtractor):
 
 
 class NRKPlaylistIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -117,11 +116,12 @@ class NRKPlaylistIE(InfoExtractor):
 
 
 class NRKTVIE(InfoExtractor):
-    _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+    IE_DESC = 'NRK TV and NRK Radio'
+    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 
     _TESTS = [
         {
-            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
             'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
             'info_dict': {
                 'id': 'MUHH48000314',
@@ -133,7 +133,7 @@ class NRKTVIE(InfoExtractor):
             },
         },
         {
-            'url': 'http://tv.nrk.no/program/mdfp15000514',
+            'url': 'https://tv.nrk.no/program/mdfp15000514',
             'md5': '383650ece2b25ecec996ad7b5bb2a384',
             'info_dict': {
                 'id': 'mdfp15000514',
@@ -146,7 +146,7 @@ class NRKTVIE(InfoExtractor):
         },
         {
             # single playlist video
-            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
             'md5': 'adbd1dbd813edaf532b0a253780719c2',
             'info_dict': {
                 'id': 'MSPO40010515-part2',
@@ -158,7 +158,7 @@ class NRKTVIE(InfoExtractor):
             'skip': 'Only works from Norway',
         },
         {
-            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
             'playlist': [
                 {
                     'md5': '9480285eff92d64f06e02a5367970a7a',
@@ -189,6 +189,10 @@ class NRKTVIE(InfoExtractor):
                 'duration': 6947.5199999999995,
             },
             'skip': 'Only works from Norway',
+        },
+        {
+            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+            'only_matching': True,
         }
     ]
 
@@ -200,24 +204,15 @@ class NRKTVIE(InfoExtractor):
         url = "%s%s" % (baseurl, subtitlesurl)
         self._debug_print('%s: Subtitle url: %s' % (video_id, url))
         captions = self._download_xml(
-            url, video_id, 'Downloading subtitles',
-            transform_source=lambda s: s.replace(r'<br />', '\r\n'))
+            url, video_id, 'Downloading subtitles')
         lang = captions.get('lang', 'no')
-        ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
-        srt = ''
-        for pos, p in enumerate(ps):
-            begin = parse_duration(p.get('begin'))
-            duration = parse_duration(p.get('dur'))
-            starttime = self._subtitles_timecode(begin)
-            endtime = self._subtitles_timecode(begin + duration)
-            srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text)
         return {lang: [
             {'ext': 'ttml', 'url': url},
-            {'ext': 'srt', 'data': srt},
         ]}
 
     def _extract_f4m(self, manifest_url, video_id):
-        return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+        return self._extract_f4m_formats(
+            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -279,7 +274,7 @@ class NRKTVIE(InfoExtractor):
 
         m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
         if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
+            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
         self._sort_formats(formats)
 
         subtitles_url = self._html_search_regex(
index 03f0a4de6dccd8485091b802422bbfaf5f7402df..7f254b867da66f70a79ff7aac5d81eb6f37bd997 100644 (file)
@@ -8,30 +8,8 @@ from ..utils import (
 )
 
 
-class NYTimesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
-
-    _TESTS = [{
-        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
-        'md5': '18a525a510f942ada2720db5f31644c0',
-        'info_dict': {
-            'id': '100000002847155',
-            'ext': 'mov',
-            'title': 'Verbatim: What Is a Photocopier?',
-            'description': 'md5:93603dada88ddbda9395632fdc5da260',
-            'timestamp': 1398631707,
-            'upload_date': '20140427',
-            'uploader': 'Brett Weiner',
-            'duration': 419,
-        }
-    }, {
-        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
+class NYTimesBaseIE(InfoExtractor):
+    def _extract_video_from_id(self, video_id):
         video_data = self._download_json(
             'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
             video_id, 'Downloading video JSON')
@@ -81,3 +59,59 @@ class NYTimesIE(InfoExtractor):
             'formats': formats,
             'thumbnails': thumbnails,
         }
+
+
+class NYTimesIE(NYTimesBaseIE):
+    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+        'md5': '18a525a510f942ada2720db5f31644c0',
+        'info_dict': {
+            'id': '100000002847155',
+            'ext': 'mov',
+            'title': 'Verbatim: What Is a Photocopier?',
+            'description': 'md5:93603dada88ddbda9395632fdc5da260',
+            'timestamp': 1398631707,
+            'upload_date': '20140427',
+            'uploader': 'Brett Weiner',
+            'duration': 419,
+        }
+    }, {
+        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        return self._extract_video_from_id(video_id)
+
+
+class NYTimesArticleIE(NYTimesBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
+    _TESTS = [{
+        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
+        'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
+        'info_dict': {
+            'id': '100000003628438',
+            'ext': 'mov',
+            'title': 'New Minimum Wage: $70,000 a Year',
+            'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
+            'timestamp': 1429033037,
+            'upload_date': '20150414',
+            'uploader': 'Matthew Williams',
+        }
+    }, {
+        'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id')
+
+        return self._extract_video_from_id(video_id)
index 155d0ee6a834fa6fb551900e0d0c35dcf0007a5c..215ffe87b55db126300f0c18c98d9c5bfd920ed7 100644 (file)
@@ -2,16 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     unified_strdate,
     int_or_none,
     qualities,
+    unescapeHTML,
 )
 
 
 class OdnoklassnikiIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
     _TESTS = [{
+        # metadata in JSON
         'url': 'http://ok.ru/video/20079905452',
         'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
         'info_dict': {
@@ -19,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Культура меняет нас (прекрасный ролик!))',
             'duration': 100,
-            'upload_date': '20141207',
             'uploader_id': '330537914540',
             'uploader': 'Виталий Добровольский',
             'like_count': int,
-            'age_limit': 0,
+        },
+    }, {
+        # metadataUrl
+        'url': 'http://ok.ru/video/63567059965189-0',
+        'md5': '9676cf86eff5391d35dea675d224e131',
+        'info_dict': {
+            'id': '63567059965189-0',
+            'ext': 'mp4',
+            'title': 'Девушка без комплексов ...',
+            'duration': 191,
+            'uploader_id': '534380003155',
+            'uploader': 'Андрей Мещанинов',
+            'like_count': int,
         },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -33,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://ok.ru/video/%s' % video_id, video_id)
 
         player = self._parse_json(
-            self._search_regex(
-                r"OKVideo\.start\(({.+?})\s*,\s*'VideoAutoplay_player'", webpage, 'player'),
+            unescapeHTML(self._search_regex(
+                r'data-attributes="([^"]+)"', webpage, 'player')),
             video_id)
 
-        metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+        flashvars = player['flashvars']
+
+        metadata = flashvars.get('metadata')
+        if metadata:
+            metadata = self._parse_json(metadata, video_id)
+        else:
+            metadata = self._download_json(
+                compat_urllib_parse_unquote(flashvars['metadataUrl']),
+                video_id, 'Downloading metadata JSON')
 
         movie = metadata['movie']
         title = movie['title']
@@ -52,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):
         uploader = author.get('name')
 
         upload_date = unified_strdate(self._html_search_meta(
-            'ya:ovs:upload_date', webpage, 'upload date'))
+            'ya:ovs:upload_date', webpage, 'upload date', default=None))
 
         age_limit = None
         adult = self._html_search_meta(
-            'ya:ovs:adult', webpage, 'age limit')
+            'ya:ovs:adult', webpage, 'age limit', default=None)
         if adult:
             age_limit = 18 if adult == 'true' else 0
 
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644 (file)
index 0000000..0f1f448
--- /dev/null
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class OnionStudiosIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+    _TESTS = [{
+        'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'info_dict': {
+            'id': '2937',
+            'ext': 'mp4',
+            'title': 'Hannibal charges forward, stops for a cocktail',
+            'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'The A.V. Club',
+            'uploader_id': 'TheAVClub',
+        },
+    }, {
+        'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+
+        formats = []
+        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
+            if determine_ext(src) != 'm3u8':  # m3u8 always results in 403
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
+            webpage, 'title', group='title')
+        description = self._search_regex(
+            r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
+            webpage, 'description', default=None, group='description')
+        thumbnail = self._search_regex(
+            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
+            webpage, 'thumbnail', default=False, group='thumbnail')
+
+        uploader_id = self._search_regex(
+            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
+            webpage, 'uploader id', fatal=False, group='uploader_id')
+        uploader = self._search_regex(
+            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
+            webpage, 'uploader', default=False, group='uploader')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }
index d5b05c18febb580a448263b4f7b2876ef3234957..a262a9f6d4ec232e78ee34f3ce38b03ea27d01e9 100644 (file)
@@ -1,63 +1,41 @@
 from __future__ import unicode_literals
 import re
 import json
+import base64
 
 from .common import InfoExtractor
 from ..utils import (
     unescapeHTML,
     ExtractorError,
+    determine_ext,
+    int_or_none,
 )
 
 
-class OoyalaIE(InfoExtractor):
-    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
-
-    _TESTS = [
-        {
-            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
-            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
-            'info_dict': {
-                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
-                'ext': 'mp4',
-                'title': 'Explaining Data Recovery from Hard Drives and SSDs',
-                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
-            },
-        }, {
-            # Only available for ipad
-            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
-            'info_dict': {
-                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
-                'ext': 'mp4',
-                'title': 'Simulation Overview - Levels of Simulation',
-                'description': '',
-            },
-        },
-    ]
+class OoyalaBaseIE(InfoExtractor):
 
-    @staticmethod
-    def _url_for_embed_code(embed_code):
-        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+    def _extract_result(self, info, more_info):
+        embedCode = info['embedCode']
+        video_url = info.get('ipad_url') or info['url']
 
-    @classmethod
-    def _build_url_result(cls, embed_code):
-        return cls.url_result(cls._url_for_embed_code(embed_code),
-                              ie=cls.ie_key())
+        if determine_ext(video_url) == 'm3u8':
+            formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4')
+        else:
+            formats = [{
+                'url': video_url,
+                'ext': 'mp4',
+            }]
 
-    def _extract_result(self, info, more_info):
         return {
-            'id': info['embedCode'],
-            'ext': 'mp4',
+            'id': embedCode,
             'title': unescapeHTML(info['title']),
-            'url': info.get('ipad_url') or info['url'],
+            'formats': formats,
             'description': unescapeHTML(more_info['description']),
             'thumbnail': more_info['promo'],
         }
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        embedCode = mobj.group('id')
-        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
-        player = self._download_webpage(player_url, embedCode)
+    def _extract(self, player_url, video_id):
+        player = self._download_webpage(player_url, video_id)
         mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
                                         player, 'mobile player url')
         # Looks like some videos are only available for particular devices
@@ -70,13 +48,43 @@ class OoyalaIE(InfoExtractor):
         devices.insert(0, 'unknown')
         for device in devices:
             mobile_player = self._download_webpage(
-                '%s&device=%s' % (mobile_url, device), embedCode,
+                '%s&device=%s' % (mobile_url, device), video_id,
                 'Downloading mobile player JS for %s device' % device)
             videos_info = self._search_regex(
                 r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
                 mobile_player, 'info', fatal=False, default=None)
             if videos_info:
                 break
+
+        if not videos_info:
+            formats = []
+            auth_data = self._download_json(
+                'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id),
+                video_id)
+
+            cur_auth_data = auth_data['authorization_data'][video_id]
+
+            for stream in cur_auth_data['streams']:
+                formats.append({
+                    'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'),
+                    'ext': stream.get('delivery_type'),
+                    'format': stream.get('video_codec'),
+                    'format_id': stream.get('profile'),
+                    'width': int_or_none(stream.get('width')),
+                    'height': int_or_none(stream.get('height')),
+                    'abr': int_or_none(stream.get('audio_bitrate')),
+                    'vbr': int_or_none(stream.get('video_bitrate')),
+                })
+            if formats:
+                return {
+                    'id': video_id,
+                    'formats': formats,
+                    'title': 'Ooyala video',
+                }
+
+            if not cur_auth_data['authorized']:
+                raise ExtractorError(cur_auth_data['message'], expected=True)
+
         if not videos_info:
             raise ExtractorError('Unable to extract info')
         videos_info = videos_info.replace('\\"', '"')
@@ -89,9 +97,100 @@ class OoyalaIE(InfoExtractor):
             videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
             return {
                 '_type': 'playlist',
-                'id': embedCode,
+                'id': video_id,
                 'title': unescapeHTML(videos_more_info['title']),
                 'entries': videos,
             }
         else:
             return self._extract_result(videos_info[0], videos_more_info)
+
+
+class OoyalaIE(OoyalaBaseIE):
+    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+    _TESTS = [
+        {
+            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+            'info_dict': {
+                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+                'ext': 'mp4',
+                'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+            },
+        }, {
+            # Only available for ipad
+            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+            'info_dict': {
+                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+                'ext': 'mp4',
+                'title': 'Simulation Overview - Levels of Simulation',
+                'description': '',
+            },
+        },
+        {
+            # Information available only through SAS api
+            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+            'md5': 'a84001441b35ea492bc03736e59e7935',
+            'info_dict': {
+                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+                'ext': 'mp4',
+                'title': 'Ooyala video',
+            }
+        }
+    ]
+
+    @staticmethod
+    def _url_for_embed_code(embed_code):
+        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+    @classmethod
+    def _build_url_result(cls, embed_code):
+        return cls.url_result(cls._url_for_embed_code(embed_code),
+                              ie=cls.ie_key())
+
+    def _real_extract(self, url):
+        embed_code = self._match_id(url)
+        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+        return self._extract(player_url, embed_code)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        ooyalaexternal:|
+                        https?://.+?\.ooyala\.com/.*?\bexternalId=
+                    )
+                    (?P<partner_id>[^:]+)
+                    :
+                    (?P<id>.+)
+                    (?:
+                        :|
+                        .*?&pcode=
+                    )
+                    (?P<pcode>.+?)
+                    (&|$)
+                    '''
+
+    _TEST = {
+        'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+        'info_dict': {
+            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+            'ext': 'mp4',
+            'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+            'description': '',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id')
+        video_id = mobj.group('id')
+        pcode = mobj.group('pcode')
+        player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode)
+        return self._extract(player_url, video_id)
index 2249657eb1b796970c155f4baf3445fef9b60681..d2ceedd018fe1f0237aefa17330d5dbe3d94f68c 100644 (file)
@@ -3,9 +3,9 @@ from __future__ import unicode_literals
 import json
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
 from ..utils import (
     parse_iso8601,
-    compat_urllib_parse,
     parse_age_limit,
     int_or_none,
 )
@@ -37,7 +37,7 @@ class OpenFilmIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        player = compat_urllib_parse.unquote_plus(
+        player = compat_urllib_parse_unquote_plus(
             self._og_search_video_url(webpage))
 
         video = json.loads(self._search_regex(
index ca1a5bb3cd520fd9add9f195c89d8ac13619467b..2e6c9872b5d251be4eb3c61addab113aad4d2416 100644 (file)
@@ -210,16 +210,16 @@ class ORFIPTVIE(InfoExtractor):
     _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://iptv.orf.at/stories/2267952',
-        'md5': '26ffa4bab6dbce1eee78bbc7021016cd',
+        'url': 'http://iptv.orf.at/stories/2275236/',
+        'md5': 'c8b22af4718a4b4af58342529453e3e5',
         'info_dict': {
-            'id': '339775',
+            'id': '350612',
             'ext': 'flv',
-            'title': 'Kreml-Kritiker Nawalny wieder frei',
-            'description': 'md5:6f24e7f546d364dacd0e616a9e409236',
-            'duration': 84.729,
+            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+            'duration': 68.197,
             'thumbnail': 're:^https?://.*\.jpg$',
-            'upload_date': '20150306',
+            'upload_date': '20150425',
         },
     }
 
index f179ea2008636f061c6a4cdad6fc69841a291076..6cdc2638b4930dc92835d71f673b560dea99022d 100644 (file)
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
             r'<div class="attach"><a target="_blank" href="([^"]+)">',
             webpage, 'attachment URL', default=None)
         embed = self._html_search_regex(
-            r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+            r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
             webpage, 'embedded URL', default=None)
 
         if attach_fn is not None:
index afce732e141a1ae6cec78cc28ed4376fa174ab1f..fec5d65ad94892ca0f40a9e49703c857d98b47a4 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -5,6 +6,8 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    determine_ext,
+    int_or_none,
     unified_strdate,
     US_RATINGS,
 )
@@ -33,6 +36,9 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
                 'duration': 3190,
             },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
@@ -44,6 +50,9 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
                 'duration': 5050,
             },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            }
         },
         {
             'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -66,7 +75,10 @@ class PBSIE(InfoExtractor):
                 'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
                 'duration': 6559,
                 'thumbnail': 're:^https?://.*\.jpg$',
-            }
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -80,7 +92,10 @@ class PBSIE(InfoExtractor):
                 'duration': 3172,
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'upload_date': '20140122',
-            }
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -88,6 +103,21 @@ class PBSIE(InfoExtractor):
                 'id': 'united-states-of-secrets',
             },
             'playlist_count': 2,
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+            'info_dict': {
+                'id': '2280706814',
+                'display_id': 'player',
+                'ext': 'mp4',
+                'title': 'Death and the Civil War',
+                'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+                'duration': 6705,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         }
     ]
 
@@ -121,7 +151,7 @@ class PBSIE(InfoExtractor):
                 return media_id, presumptive_id, upload_date
 
             url = self._search_regex(
-                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+                r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
                 webpage, 'player URL')
             mobj = re.match(self._VALID_URL, url)
 
@@ -149,36 +179,68 @@ class PBSIE(InfoExtractor):
                 for vid_id in video_id]
             return self.playlist_result(entries, display_id)
 
-        info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
-        info = self._download_json(info_url, display_id)
-
-        redirect_url = info['alternate_encoding']['url']
-        redirect_info = self._download_json(
-            redirect_url + '?format=json', display_id,
-            'Downloading video url info')
-        if redirect_info['status'] == 'error':
-            if redirect_info['http_code'] == 403:
-                message = (
-                    'The video is not available in your region due to '
-                    'right restrictions')
+        info = self._download_json(
+            'http://video.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
+            display_id)
+
+        formats = []
+        for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+            redirect = info.get(encoding_name)
+            if not redirect:
+                continue
+            redirect_url = redirect.get('url')
+            if not redirect_url:
+                continue
+
+            redirect_info = self._download_json(
+                redirect_url + '?format=json', display_id,
+                'Downloading %s video url info' % encoding_name)
+
+            if redirect_info['status'] == 'error':
+                if redirect_info['http_code'] == 403:
+                    message = (
+                        'The video is not available in your region due to '
+                        'right restrictions')
+                else:
+                    message = redirect_info['message']
+                raise ExtractorError(message, expected=True)
+
+            format_url = redirect_info.get('url')
+            if not format_url:
+                continue
+
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
             else:
-                message = redirect_info['message']
-            raise ExtractorError(message, expected=True)
+                formats.append({
+                    'url': format_url,
+                    'format_id': redirect.get('eeid'),
+                })
+        self._sort_formats(formats)
 
         rating_str = info.get('rating')
         if rating_str is not None:
             rating_str = rating_str.rpartition('-')[2]
         age_limit = US_RATINGS.get(rating_str)
 
+        subtitles = {}
+        closed_captions_url = info.get('closed_captions_url')
+        if closed_captions_url:
+            subtitles['en'] = [{
+                'ext': 'ttml',
+                'url': closed_captions_url,
+            }]
+
         return {
             'id': video_id,
             'display_id': display_id,
             'title': info['title'],
-            'url': redirect_info['url'],
-            'ext': 'mp4',
             'description': info['program'].get('description'),
             'thumbnail': info.get('image_url'),
-            'duration': info.get('duration'),
+            'duration': int_or_none(info.get('duration')),
             'age_limit': age_limit,
             'upload_date': upload_date,
+            'formats': formats,
+            'subtitles': subtitles,
         }
diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py
new file mode 100644 (file)
index 0000000..6e60e5f
--- /dev/null
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class PhilharmonieDeParisIE(InfoExtractor):
+    # Extractor for concert pages and playlist URLs on live.philharmoniedeparis.fr.
+    IE_DESC = 'Philharmonie de Paris'
+    _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
+        'info_dict': {
+            'id': '1032066',
+            'ext': 'flv',
+            'title': 'md5:d1f5585d87d041d07ce9434804bc8425',
+            'timestamp': 1428179400,
+            'upload_date': '20150404',
+            'duration': 6592.278,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict for one concert from its playlist XML.
+
+        The numeric id matched from ``url`` is used to download
+        ``misc/Playlist.ashx``; title, formats, duration and date are all
+        read from the ``concert`` element of that document.
+        """
+        video_id = self._match_id(url)
+
+        playlist = self._download_xml(
+            'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id,
+            video_id)
+        concert = playlist.find('./concert')
+
+        # The title is mandatory (fatal=True), so resolve it before anything else.
+        title = xpath_text(concert, './titre', 'title', fatal=True)
+
+        files_node = concert.find('./fichiers')
+        stream_server = files_node.attrib['serveurstream']
+
+        # (format_id, attribute suffix) pairs; the position doubles as the
+        # 'quality' rank handed to the format sorter.
+        variants = [('lq', ''), ('hq', '_hd')]
+
+        formats = []
+        extra_fields = {}
+        for file_node in files_node.findall('./fichier'):
+            # Each file overwrites the duration, so the last one wins.
+            extra_fields['duration'] = float_or_none(file_node.get('timecodefin'))
+            for rank, (format_id, suffix) in enumerate(variants):
+                play_path = file_node.get('url%s' % suffix)
+                if play_path:
+                    formats.append({
+                        'url': stream_server,
+                        'play_path': play_path,
+                        'ext': 'flv',
+                        'format_id': format_id,
+                        'width': int_or_none(concert.get('largeur%s' % suffix)),
+                        'height': int_or_none(concert.get('hauteur%s' % suffix)),
+                        'quality': rank,
+                    })
+        self._sort_formats(formats)
+
+        date = concert.get('date')
+        hour = concert.get('heure')
+        if date:
+            if hour:
+                # date is sliced as YYYYMMDD and joined with the hour into ISO 8601.
+                extra_fields['timestamp'] = parse_iso8601(
+                    '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour))
+            else:
+                extra_fields['upload_date'] = date
+
+        info_dict = {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+        info_dict.update(extra_fields)
+        return info_dict
index c66db3cdc84e55a6a3a904ddf3ff7c09aaac9573..788411ccc18082f59588d40704900c26dba1fe21 100644 (file)
@@ -4,7 +4,7 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
 
 
 class PhotobucketIE(InfoExtractor):
@@ -34,7 +34,7 @@ class PhotobucketIE(InfoExtractor):
         info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
                                        webpage, 'info json')
         info = json.loads(info_json)
-        url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+        url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
         return {
             'id': video_id,
             'url': url,
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644 (file)
index 0000000..a52210f
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    # Extractor for pinkbike.com video pages and the kvid SWF player URLs.
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Scrape the video page for its formats and metadata."""
+        video_id = self._match_id(url)
+
+        # The canonical page is fetched whichever URL form was matched.
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        def height_of(quality_label):
+            # A label of the form "<digits>p" carries the height; others give None.
+            return int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', quality_label, 'height', default=None))
+
+        # The first group is only the quote style shared by both attributes.
+        formats = [{
+            'url': src,
+            'format_id': quality_label,
+            'height': height_of(quality_label),
+        } for _, quality_label, src in re.findall(
+            r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage)]
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None)
+        if not description:
+            # Fall back to the Open Graph description minus the leading title.
+            description = remove_start(
+                self._og_search_description(webpage), title + '. ')
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+        raw_upload_date = self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False)
+        upload_date = unified_strdate(raw_upload_date)
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
+        # Statistics are rendered as a "stat-num" span followed by its label span.
+        count_template = r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s'
+
+        def count_for(label):
+            return str_to_int(self._search_regex(
+                count_template % label, webpage, label, fatal=False))
+
+        view_count = count_for('Views')
+        comment_count = count_for('Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
index abde34b94659574041469e8f2f4a8b0e4e903bb1..551c8c9f0fef4566afd5691628b2c216c157fd0c 100644 (file)
@@ -30,7 +30,7 @@ class PladformIE(InfoExtractor):
         'info_dict': {
             'id': '100183293',
             'ext': 'mp4',
-            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
+            'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
             'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 694,
index 596c621d75067255f5be6e4eaf97ba897cc51fe5..06505e96fb9ac81224f03f71299eb80238c0921f 100644 (file)
@@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor):
             'id': '3586',
             'ext': 'flv',
             'title': 'md5:e829428ee28b1deed00de90de49d1da1',
-        }
+        },
+        'skip': 'Not accessible from Travis CI server',
     }
 
     _SONG_FORMATS = {
index 45716c75d9505c5fcb7e8c6d73ec4feaef298aee..8a1c296dda8b57611a0e464387be43ab0fc9a370 100644 (file)
@@ -38,9 +38,7 @@ class PlayedIE(InfoExtractor):
         if m_error:
             raise ExtractorError(m_error.group('msg'), expected=True)
 
-        fields = re.findall(
-            r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
-        data = dict(fields)
+        data = self._hidden_inputs(orig_webpage)
 
         self._sleep(2, video_id)
 
index c3e667e9e72ea0aaf6e5db731f630816e6a2861d..2eb4fd96dcbc071c1c2ecfb596ab20c4526018bd 100644 (file)
@@ -4,7 +4,8 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
 )
 from ..utils import (
     clean_html,
@@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor):
         flashvars = self._html_search_regex(
             r'flashvars="(.+?)"', webpage, 'flashvars')
 
-        infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+        infos = compat_urllib_parse_unquote(flashvars).split(r'&')
         for info in infos:
             videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
             if videovars_match:
@@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor):
                 val = videovars_match.group(2)
 
                 if key == 'title':
-                    video_title = compat_urllib_parse.unquote_plus(val)
+                    video_title = compat_urllib_parse_unquote_plus(val)
                 if key == 'duration':
                     try:
                         duration = int(val)
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644 (file)
index 0000000..72d1b27
--- /dev/null
@@ -0,0 +1,71 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+    # Extractor for 91porn.com video pages identified by their "viewkey".
+    IE_NAME = '91porn'
+    _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+    _TEST = {
+        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+        'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+        'info_dict': {
+            'id': '7e42283b4f5ab36da134',
+            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+            'ext': 'mp4',
+            'duration': 431,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Resolve the real media URL and basic metadata for one video.
+
+        The page is re-requested under its canonical URL, the player
+        variables found in it are sent to ``getfile.php``, and the ``file``
+        parameter of that response becomes the video URL.
+
+        Raises ExtractorError (expected) when the page shows the visitor
+        daily-limit notice, and through ``_search_regex`` when the title or
+        one of the mandatory player variables is missing.
+        """
+        video_id = self._match_id(url)
+        url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+        # NOTE(review): presumably forces the Chinese page so that the
+        # Chinese-language markers matched below are present - confirm.
+        self._set_cookie('91porn.com', 'language', 'cn_CN')
+        webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+        if '作为游客,你每天只可观看10个视频' in webpage:
+            raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+        title = self._search_regex(
+            r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+        title = title.replace('\n', '')
+
+        # get real url
+        # The dot in "so.addVariable" is escaped so that it only matches a
+        # literal dot rather than any character.
+        file_id = self._search_regex(
+            r'so\.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+        sec_code = self._search_regex(
+            r'so\.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+        max_vid = self._search_regex(
+            r'so\.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+        url_params = compat_urllib_parse.urlencode({
+            'VID': file_id,
+            'mp4': '1',
+            'seccode': sec_code,
+            'max_vid': max_vid,
+        })
+        info_cn = self._download_webpage(
+            'http://91porn.com/getfile.php?' + url_params, video_id,
+            'get real video url')
+        video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+        # Duration and comment count are optional (fatal=False).
+        duration = parse_duration(self._search_regex(
+            r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+        comment_count = int_or_none(self._search_regex(
+            r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'comment_count': comment_count,
+        }
index 0c8b731cf47267568e43ccd09ff21f1683b4d992..0b7886840fbced3d9fa6fb219050f40ac709c080 100644 (file)
@@ -5,7 +5,8 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 )
@@ -19,8 +20,8 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '882f488fa1f0026f023f33576004a2ed',
         'info_dict': {
@@ -30,7 +31,17 @@ class PornHubIE(InfoExtractor):
             "title": "Seductive Indian beauty strips down and fingers her pink pussy",
             "age_limit": 18
         }
-    }
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        """Return the URL of the first PornHub embed iframe in webpage.
+
+        The iframe ``src`` may be http, https or protocol-relative. The id
+        part accepts lowercase letters as well as digits, in line with the
+        ids that ``_VALID_URL`` accepts (e.g. ``ph557bbb6676d2d``).
+
+        Returns None when no matching iframe is present.
+        """
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[0-9a-z]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
 
     def _extract_count(self, pattern, webpage, name):
         return str_to_int(self._search_regex(
@@ -39,7 +50,8 @@ class PornHubIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = compat_urllib_request.Request(url)
+        req = compat_urllib_request.Request(
+            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
@@ -58,7 +70,7 @@ class PornHubIE(InfoExtractor):
             webpage, 'uploader', fatal=False)
         thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
         if thumbnail:
-            thumbnail = compat_urllib_parse.unquote(thumbnail)
+            thumbnail = compat_urllib_parse_unquote(thumbnail)
 
         view_count = self._extract_count(
             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
@@ -69,9 +81,10 @@ class PornHubIE(InfoExtractor):
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
-        video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+            password = compat_urllib_parse_unquote_plus(
+                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 
         formats = []
index 9688ed94898de231e6c7f1c9dc28d3779da10311..eba4dfbb39576bff355b722c997dd31e07ce370f 100644 (file)
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
             'duration': 120,
             'view_count': int,
             'average_rating': float,
-            'categories': ['Débutante', 'Scénario', 'Sodomie'],
+            'categories': ['Débutantes', 'Scénario', 'Sodomie'],
             'age_limit': 18,
         }
     }
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
         view_count = int_or_none(self._search_regex(
             r'(\d+) vues', webpage, 'view count', fatal=False))
         average_rating = self._search_regex(
-            r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
         if average_rating:
             average_rating = float_or_none(average_rating.replace(',', '.'))
 
index 01cc3d9ea3ff845476a7f7b306c3bfee25078b96..304359dc5b189b8ce27c967c2d369b26db334532 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
@@ -31,12 +29,7 @@ class PrimeShareTVIE(InfoExtractor):
         if '>File not exist<' in webpage:
             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         headers = {
             'Referer': url,
index f536e6e6cdfb3d71e21c98614e2baf117387493b..8190ed6766ce5c878fc82700524ec6d012d70a57 100644 (file)
@@ -35,10 +35,7 @@ class PromptFileIE(InfoExtractor):
             raise ExtractorError('Video %s does not exist' % video_id,
                                  expected=True)
 
-        fields = dict(re.findall(r'''(?x)type="hidden"\s+
-            name="(.+?)"\s+
-            value="(.*?)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
         post = compat_urllib_parse.urlencode(fields)
         req = compat_urllib_request.Request(url, post)
         req.add_header('Content-type', 'application/x-www-form-urlencoded')
index 7cc7996642cae1de1ca2a585391d167025b92162..fec008ce7687a0360d2e54e3130425245f30b4fc 100644 (file)
@@ -9,18 +9,24 @@ from ..compat import (
     compat_urllib_parse,
 )
 from ..utils import (
-    unified_strdate,
+    determine_ext,
     int_or_none,
+    unified_strdate,
 )
 
 
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
+            # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242
+            # in response to fixing https://github.com/rg3/youtube-dl/issues/6215:
+            # - malformed f4m manifest support
+            # - proper handling of URLs starting with `https?://` in 2.0 manifests
+            # - recursive child f4m manifests extraction
             'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
             'info_dict': {
                 'id': '2104602',
@@ -177,6 +183,7 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +213,8 @@ class ProSiebenSat1IE(InfoExtractor):
     def _extract_clip(self, url, webpage):
         clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
-        access_token = 'testclient'
-        client_name = 'kolibri-1.2.5'
+        access_token = 'prosieben'
+        client_name = 'kolibri-2.0.19-splec4'
         client_location = url
 
         videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -274,23 +281,30 @@ class ProSiebenSat1IE(InfoExtractor):
 
         for source in urls_sources:
             protocol = source['protocol']
+            source_url = source['url']
             if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
                 if not mobj:
                     continue
+                path = mobj.group('path')
+                mp4colon_index = path.rfind('mp4:')
+                app = path[:mp4colon_index]
+                play_path = path[mp4colon_index:]
                 formats.append({
-                    'url': mobj.group('url'),
-                    'app': mobj.group('app'),
-                    'play_path': mobj.group('playpath'),
+                    'url': '%s/%s' % (mobj.group('url'), app),
+                    'app': app,
+                    'play_path': play_path,
                     'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                     'page_url': 'http://www.prosieben.de',
                     'vbr': fix_bitrate(source['bitrate']),
                     'ext': 'mp4',
                     'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
                 })
+            elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+                formats.extend(self._extract_f4m_formats(source_url, clip_id))
             else:
                 formats.append({
-                    'url': source['url'],
+                    'url': source_url,
                     'vbr': fix_bitrate(source['bitrate']),
                 })
 
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
new file mode 100644 (file)
index 0000000..1654a64
--- /dev/null
@@ -0,0 +1,317 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import time
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    strip_jsonp,
+    unescapeHTML,
+    clean_html,
+)
+from ..compat import compat_urllib_request
+
+
+class QQMusicIE(InfoExtractor):
+    # Extractor for single songs on y.qq.com, addressed by their song "mid".
+    IE_NAME = 'qqmusic'
+    IE_DESC = 'QQ音乐'
+    _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
+        'md5': '9ce1c1c8445f561506d2e3cfb0255705',
+        'info_dict': {
+            'id': '004295Et37taLD',
+            'ext': 'mp3',
+            'title': '可惜没如果',
+            'upload_date': '20141227',
+            'creator': '林俊杰',
+            'description': 'md5:d327722d0361576fde558f1ac68a7065',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'There is no mp3-320 version of this song.',
+        'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
+        'info_dict': {
+            'id': '004MsGEo3DdNxV',
+            'ext': 'mp3',
+            'title': '如果',
+            'upload_date': '20050626',
+            'creator': '李季美',
+            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }]
+
+    # Per-format URL prefix, file extension, sort preference and (where
+    # given) audio bitrate.
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
+    # Reference: m_r_GetRUin() in top_player.js
+    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
+    @staticmethod
+    def m_r_get_ruin():
+        """Return a pseudo-random integer used as the ``guid`` request parameter."""
+        millis = int(time.time() * 1000) % 1000
+        seed = round(random.random() * 2147483647)
+        return int(seed * millis % 1E10)
+
+    def _real_extract(self, url):
+        """Collect metadata from the song detail page and build the stream formats."""
+        mid = self._match_id(url)
+
+        # The detail page is decoded as GBK.
+        detail_page = self._download_webpage(
+            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
+            mid, note='Download song detail info',
+            errnote='Unable to get song detail info', encoding='gbk')
+
+        title = self._html_search_regex(
+            r"songname:\s*'([^']+)'", detail_page, 'song name')
+
+        upload_date = self._html_search_regex(
+            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_page,
+            'publish time', default=None)
+        if upload_date:
+            # YYYY-MM-DD -> YYYYMMDD
+            upload_date = upload_date.replace('-', '')
+
+        creator = self._html_search_regex(
+            r"singer:\s*'([^']+)", detail_page, 'singer', default=None)
+
+        lyrics = self._html_search_regex(
+            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
+            detail_page, 'LRC lyrics', default=None)
+        if lyrics:
+            # Turn the literal backslash-n sequences into real line breaks.
+            lyrics = lyrics.replace('\\n', '\n')
+
+        album_mid = self._search_regex(
+            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+            detail_page, 'album mid', default=None)
+        # The cover path is keyed by the last two characters of the album mid.
+        thumbnail = (
+            "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg"
+            % (album_mid[-2:-1], album_mid[-1], album_mid)
+        ) if album_mid else None
+
+        guid = self.m_r_get_ruin()
+
+        key_info = self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)
+        vkey = key_info['key']
+
+        formats = [{
+            'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                   % (spec['prefix'], mid, spec['ext'], vkey, guid),
+            'format': format_id,
+            'format_id': format_id,
+            'preference': spec['preference'],
+            'abr': spec.get('abr'),
+        } for format_id, spec in self._FORMATS.items()]
+        self._check_formats(formats, mid)
+        self._sort_formats(formats)
+
+        return {
+            'id': mid,
+            'formats': formats,
+            'title': title,
+            'upload_date': upload_date,
+            'creator': creator,
+            'description': lyrics,
+            'thumbnail': thumbnail,
+        }
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    # Helpers shared by the QQ Music playlist-style extractors.
+
+    @staticmethod
+    def qq_static_url(category, mid):
+        """Return the static page URL for ``mid`` within ``category``.
+
+        The path is sharded by the last two characters of ``mid``.
+        """
+        second_last, last = mid[-2], mid[-1]
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, second_last, last, mid)
+
+    @classmethod
+    def get_entries_from_page(cls, page):
+        """Return one url_result per ``class="data"`` element found in page.
+
+        Each element holds a '|'-separated record; the song mid is the
+        fifth field counted from the end.
+        """
+        song_mids = [
+            unescapeHTML(record).split('|')[-5]
+            for record in re.findall(r'class="data"[^<>]*>([^<>]+)</', page)
+        ]
+        return [
+            cls.url_result(
+                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
+                song_mid)
+            for song_mid in song_mids
+        ]
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    # Playlist extractor for a singer page: one entry per song listed on it.
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
+    _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+    _TEST = {
+        'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:2a222d89ba4455a3af19940c0481bb78',
+        },
+        'playlist_count': 12,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of the singer's songs, titled with the singer name.
+
+        The singer name and description are optional: a missing name or id
+        on the page, or a description document without a ``desc`` element,
+        leaves the corresponding playlist field as None.
+        """
+        mid = self._match_id(url)
+
+        singer_page = self._download_webpage(
+            self.qq_static_url('singer', mid), mid, 'Download singer page')
+
+        entries = self.get_entries_from_page(singer_page)
+
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
+            default=None)
+
+        singer_id = self._html_search_regex(
+            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
+            default=None)
+
+        singer_desc = None
+
+        if singer_id:
+            # The description request is sent with an explicit Referer header.
+            req = compat_urllib_request.Request(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
+            req.add_header(
+                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+            singer_desc_page = self._download_xml(
+                req, mid, 'Download singer description XML')
+
+            # find() returns None when the element is absent; keep the
+            # description unset instead of failing on ``.text``.
+            desc_node = singer_desc_page.find('./data/info/desc')
+            if desc_node is not None:
+                singer_desc = desc_node.text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    """Extract a QQ Music album as a playlist of its songs."""
+
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
+    # Dots are escaped so that only the literal host y.qq.com matches.
+    _VALID_URL = r'http://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+        'info_dict': {
+            'id': '000gXCTb2AhRR1',
+            'title': '我们都是这样长大的',
+            'description': 'md5:179c5dce203a5931970d306aa9607ea6',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+        'info_dict': {
+            'id': '002Y5a3b3AlCu3',
+            'title': '그리고...',
+            'description': 'md5:a48823755615508a95080e81b51ba729',
+        },
+        'playlist_count': 8,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist result built from the album info JSON.
+
+        The album name and description are optional; the song list
+        ('list', with a 'songmid' per song) is required.
+        """
+        mid = self._match_id(url)
+
+        album = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+            mid, 'Download album page')['data']
+
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+            ) for song in album['list']
+        ]
+        album_name = album.get('name')
+        album_detail = album.get('desc')
+        # Only strip when a description is present; None is passed through.
+        if album_detail is not None:
+            album_detail = album_detail.strip()
+
+        return self.playlist_result(entries, mid, album_name, album_detail)
+
+
+class QQMusicToplistIE(QQPlaylistBaseIE):
+    """Extract a QQ Music top list (chart) as a playlist of its songs."""
+
+    IE_NAME = 'qqmusic:toplist'
+    IE_DESC = 'QQ音乐 - 排行榜'
+    _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=toplist&p=global_123',
+        'info_dict': {
+            'id': 'global_123',
+            'title': '美国iTunes榜',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=top_3',
+        'info_dict': {
+            'id': 'top_3',
+            'title': 'QQ音乐巅峰榜·欧美',
+            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=global_106',
+        'info_dict': {
+            'id': 'global_106',
+            'title': '韩国Mnet榜',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist result for the top list identified in `url`."""
+        list_id = self._match_id(url)
+
+        # _VALID_URL fixes the id to the '<type>_<number>' shape, so the
+        # split always yields exactly two parts.
+        list_type, num_id = list_id.split('_')
+
+        api_url = (
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+            % (list_type, num_id))
+        toplist_json = self._download_json(
+            api_url, list_id, 'Download toplist page')
+
+        entries = []
+        for song in toplist_json['songlist']:
+            song_mid = song['data']['songmid']
+            entries.append(self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', song_mid))
+
+        # Title and description are optional metadata.
+        topinfo = toplist_json.get('topinfo', {})
+        return self.playlist_result(
+            entries, list_id, topinfo.get('ListName'), topinfo.get('info'))
+
+
+class QQMusicPlaylistIE(QQPlaylistBaseIE):
+    """Extract a QQ Music user playlist ('taoge') as a playlist of its songs."""
+
+    IE_NAME = 'qqmusic:playlist'
+    IE_DESC = 'QQ音乐 - 歌单'
+    _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+        'info_dict': {
+            'id': '3462654915',
+            'title': '韩国5月新歌精选下旬',
+            'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
+        },
+        'playlist_count': 40,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist result for the playlist identified in `url`."""
+        list_id = self._match_id(url)
+
+        api_url = (
+            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
+            % list_id)
+        # The response is passed through strip_jsonp before JSON parsing.
+        response = self._download_json(
+            api_url, list_id, 'Download list page',
+            transform_source=strip_jsonp)
+        list_json = response['cdlist'][0]
+
+        entries = []
+        for song in list_json['songlist']:
+            song_mid = song['songmid']
+            entries.append(self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', song_mid))
+
+        list_name = list_json.get('dissname')
+        raw_description = list_json.get('desc')
+        list_description = clean_html(unescapeHTML(raw_description))
+        return self.playlist_result(entries, list_id, list_name, list_description)
index af7d76cf47e575277de3ffe480fdb8e1eb48b43c..f414e2384e619e8faccab280bdb9e8a9518819c9 100644 (file)
@@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
             'view_count': int,
         },
+        'skip': 'Not accessible from Travis CI server',
     }
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
new file mode 100644 (file)
index 0000000..796adfd
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class RDSIE(InfoExtractor):
+    """Extractor for videos on rds.ca."""
+
+    IE_DESC = 'RDS.ca'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+        'info_dict': {
+            'id': '3.1132799',
+            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+            'ext': 'mp4',
+            'title': 'Fowler Jr. prend la direction de Jacksonville',
+            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
+            'timestamp': 1430397346,
+            'upload_date': '20150430',
+            'duration': 154.354,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the rds.ca video page at `url`.
+
+        The video URL and the title are required; description, thumbnail,
+        timestamp and duration are taken from the page when available.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        # TODO: extract f4m from 9c9media.com
+        video_url = self._search_regex(
+            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
+            webpage, 'video url')
+
+        # The OpenGraph lookup must not be fatal here, otherwise a page
+        # without og:title raises before the <meta name="title"> fallback
+        # is tried. The fallback itself stays fatal, so a title is still
+        # mandatory.
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
+             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
+            webpage, 'thumbnail', fatal=False)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+        duration = parse_duration(self._search_regex(
+            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
+            webpage, 'duration', fatal=False))
+        age_limit = self._family_friendly_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'age_limit': age_limit,
+        }
index dce64e1517003015722db1097ac83b106cc91136..e4215d546219bb95fe79abfb184da149148962db 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
 
 
 class RTBFIE(InfoExtractor):
@@ -16,34 +17,47 @@ class RTBFIE(InfoExtractor):
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
             'duration': 3099,
-            'timestamp': 1398456336,
-            'upload_date': '20140425',
         }
     }
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+    _QUALITIES = [
+        ('mobile', 'mobile'),
+        ('web', 'SD'),
+        ('url', 'MD'),
+        ('high', 'HD'),
+    ]
 
-        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
 
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
 
-        video_url = data.get('downloadUrl') or data.get('url')
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-video="([^"]+)"', webpage, 'data video')),
+            video_id)
 
-        if data['provider'].lower() == 'youtube':
+        if data.get('provider').lower() == 'youtube':
+            video_url = data.get('downloadUrl') or data.get('url')
             return self.url_result(video_url, 'Youtube')
+        formats = []
+        for key, format_id in self._QUALITIES:
+            format_url = data['sources'].get(key)
+            if format_url:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                })
 
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
index cfce4550ada568cfe13fae859a2bb745671074b5..e0c530d64a97500eaa1b653599f92c9420518c77 100644 (file)
@@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor):
     IE_NAME = 'rtl.nl'
     IE_DESC = 'rtl.nl and rtlxl.nl'
     _VALID_URL = r'''(?x)
-        https?://(www\.)?
+        https?://(?:www\.)?
         (?:
             rtlxl\.nl/\#!/[^/]+/|
-            rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+            rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
         )
         (?P<id>[0-9a-f-]+)'''
 
@@ -43,22 +43,51 @@ class RtlNlIE(InfoExtractor):
             'upload_date': '20150215',
             'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
         }
+    }, {
+        # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
+        'info_dict': {
+            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
+            'ext': 'mp4',
+            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+            'timestamp': 1437233400,
+            'upload_date': '20150718',
+            'duration': 30.474,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # encrypted m3u8 streams, georestricted
+        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         uuid = self._match_id(url)
         info = self._download_json(
-            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
             uuid)
 
         material = info['material'][0]
-        progname = info['abstracts'][0]['name']
-        subtitle = material['title'] or info['episodes'][0]['name']
-        description = material.get('synopsis') or info['episodes'][0]['synopsis']
+        title = info['abstracts'][0]['name']
+        subtitle = material.get('title')
+        if subtitle:
+            title += ' - %s' % subtitle
+        description = material.get('synopsis')
+
+        meta = info.get('meta', {})
 
         # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
-        videopath = material['videopath'].replace('.f4m', '.m3u8')
-        m3u8_url = 'http://manifest.us.rtl.nl' + videopath
+        # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so
+        # this adaptive -> flash workaround is not required in general, but it also
+        # allows bypassing georestriction therefore is retained for now.
+        videopath = material['videopath'].replace('/adaptive/', '/flash/')
+        m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
 
         formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
 
@@ -79,7 +108,7 @@ class RtlNlIE(InfoExtractor):
         self._sort_formats(formats)
 
         thumbnails = []
-        meta = info.get('meta', {})
+
         for p in ('poster_base_url', '"thumb_base_url"'):
             if not meta.get(p):
                 continue
@@ -95,7 +124,7 @@ class RtlNlIE(InfoExtractor):
 
         return {
             'id': uuid,
-            'title': '%s - %s' % (progname, subtitle),
+            'title': title,
             'formats': formats,
             'timestamp': material['original_date'],
             'description': description,
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644 (file)
index 785a804..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
index d0981115da9c64f1addf3108b8a9e6acf0c6508e..9fbe239d8c3fdc3290b4beace30a25357bb086b8 100644 (file)
@@ -190,6 +190,7 @@ class RTSIE(InfoExtractor):
                 'tbr': media['rate'] or extract_bitrate(media['url']),
             } for media in info['media'] if media.get('rate')])
 
+        self._check_formats(formats, video_id)
         self._sort_formats(formats)
 
         return {
index 849300140ecbf598874d22b090262eabec1e7ea5..82cd98ac742bf436b24fbbc77cac9a6fb8a44ff6 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 
 
 def _decrypt_url(png):
-    encrypted_data = base64.b64decode(png)
+    encrypted_data = base64.b64decode(png.encode('utf-8'))
     text_index = encrypted_data.find(b'tEXt')
     text_chunk = encrypted_data[text_index - 4:]
     length = struct_unpack('!I', text_chunk[:4])[0]
index ef766237bf318d40da067a6a820a725fbe0da286..d9df0686133a6772deb1e58260069857620afc58 100644 (file)
@@ -84,18 +84,27 @@ class RUTVIE(InfoExtractor):
                 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
                 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
             },
+            'skip': 'Translation has finished',
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
+            'info_dict': {
+                'id': '21',
+                'ext': 'mp4',
+                'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'is_live': True,
+            },
             'params': {
-                # rtmp download
+                # m3u8 download
                 'skip_download': True,
             },
-            'skip': 'Translation has finished',
         },
     ]
 
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
@@ -119,8 +128,10 @@ class RUTVIE(InfoExtractor):
         elif video_path.startswith('index/iframe/cast_id'):
             video_type = 'live'
 
+        is_live = video_type == 'live'
+
         json_data = self._download_json(
-            'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
+            'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id),
             video_id, 'Downloading JSON')
 
         if json_data['errors']:
@@ -147,6 +158,7 @@ class RUTVIE(InfoExtractor):
 
         for transport, links in media['sources'].items():
             for quality, url in links.items():
+                preference = -1 if priority_transport == transport else -2
                 if transport == 'rtmp':
                     mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
                     if not mobj:
@@ -160,9 +172,11 @@ class RUTVIE(InfoExtractor):
                         'rtmp_live': True,
                         'ext': 'flv',
                         'vbr': int(quality),
+                        'preference': preference,
                     }
                 elif transport == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(url, video_id, 'mp4'))
+                    formats.extend(self._extract_m3u8_formats(
+                        url, video_id, 'mp4', preference=preference, m3u8_id='hls'))
                     continue
                 else:
                     fmt = {
@@ -172,21 +186,18 @@ class RUTVIE(InfoExtractor):
                     'width': width,
                     'height': height,
                     'format_id': '%s-%s' % (transport, quality),
-                    'preference': -1 if priority_transport == transport else -2,
                 })
                 formats.append(fmt)
 
-        if not formats:
-            raise ExtractorError('No media links available for %s' % video_id)
-
         self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if is_live else title,
             'description': description,
             'thumbnail': thumbnail,
             'view_count': view_count,
             'duration': duration,
             'formats': formats,
+            'is_live': is_live,
         }
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644 (file)
index 0000000..4e22628
--- /dev/null
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+    """Extractor for videos on ruutu.fi/ohjelmat pages."""
+
+    _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+            'md5': 'ab2093f39be1ca8581963451b3c0234f',
+            'info_dict': {
+                'id': '2058907',
+                'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+                'ext': 'mp4',
+                'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+                'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 114,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa',
+            'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+            'info_dict': {
+                'id': '2057306',
+                'display_id': 'superpesis-katso-koko-kausi-ruudussa',
+                'ext': 'mp4',
+                'title': 'Superpesis: katso koko kausi Ruudussa',
+                'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 40,
+                'age_limit': 0,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict for the ruutu.fi page at `url`.
+
+        The media id is read from the page, the media XML is downloaded
+        (from the URL template in the page's player configuration when
+        present, otherwise from the gatling.ruutu.fi default) and the
+        formats are collected from its <Clip> node.
+        """
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-media-id="(\d+)"', webpage, 'media id')
+
+        video_xml_url = None
+
+        # Prefer the XML URL template embedded in the page, if any.
+        media_data = self._search_regex(
+            r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage,
+            'media data', default=None)
+        if media_data:
+            media_json = self._parse_json(media_data, display_id, fatal=False)
+            if media_json:
+                xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl')
+                if xml_url:
+                    video_xml_url = xml_url.replace('{ID}', video_id)
+
+        if not video_xml_url:
+            video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id
+
+        video_xml = self._download_xml(video_xml_url, video_id)
+
+        formats = []
+        processed_urls = []
+
+        def extract_formats(node):
+            """Collect formats from `node`, recursing into *Files containers."""
+            for child in node:
+                if child.tag.endswith('Files'):
+                    extract_formats(child)
+                elif child.tag.endswith('File'):
+                    video_url = child.text
+                    # Skip only this entry: the remaining sibling files may
+                    # still be usable, so the loop must not be aborted.
+                    if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+                        continue
+                    processed_urls.append(video_url)
+                    ext = determine_ext(video_url)
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', m3u8_id='hls'))
+                    elif ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds'))
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        # The resolution attribute is optional; only a
+                        # 'WIDTHxHEIGHT' value is unpacked, anything else
+                        # leaves both dimensions unknown.
+                        width = height = None
+                        dimensions = (child.get('resolution') or '').split('x')
+                        if len(dimensions) == 2:
+                            width, height = [int_or_none(x) for x in dimensions]
+                        formats.append({
+                            'format_id': '%s-%s' % (proto, label if label else tbr),
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'formats': formats,
+        }
index 10251f29e033ef241618ed7985e214dc0e76cd51..f3c80708c86ab2fc29fbd029b245bbe894af2dfb 100644 (file)
@@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE):
                                     library/view/[^/]+|
                                     api/v1/book
                                 )/
-                                (?P<course_id>\d+)/
+                                (?P<course_id>[^/]+)/
                                     (?:chapter(?:-content)?/)?
                                 (?P<part>part\d+)\.html
     '''
@@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE):
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -122,7 +126,7 @@ class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'
 
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
index b8775c2f99f4a105ae35f1b04a919e64c987df0f..d6ee2d9e2245475d236c12fb6967af68558d8598 100644 (file)
@@ -1,18 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import json
-import re
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    remove_end,
-)
 
 
 class SBSIE(InfoExtractor):
     IE_DESC = 'sbs.com.au'
-    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
 
     _TESTS = [{
         # Original URL is handled by the generic IE which finds the iframe:
@@ -22,38 +16,36 @@ class SBSIE(InfoExtractor):
         'info_dict': {
             'id': '320403011771',
             'ext': 'mp4',
-            'title': 'Dingo Conservation',
-            'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
+            'title': 'Dingo Conservation (The Feed)',
+            'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
             'thumbnail': 're:http://.*\.jpg',
+            'duration': 308,
         },
-        'add_ies': ['generic'],
     }, {
         'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
         'only_matching': True,
+    }, {
+        'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
+        video_id = self._match_id(url)
 
-        release_urls_json = js_to_json(self._search_regex(
-            r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
-            webpage, ''))
-        release_urls = json.loads(release_urls_json)
-        theplatform_url = (
-            release_urls.get('progressive') or release_urls.get('standard'))
+        webpage = self._download_webpage(
+            'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
 
-        title = remove_end(self._og_search_title(webpage), ' (The Feed)')
-        description = self._html_search_meta('description', webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        player_params = self._parse_json(
+            self._search_regex(
+                r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
+            video_id)
+
+        urls = player_params['releaseUrls']
+        theplatform_url = (urls.get('progressive') or urls.get('standard') or
+                           urls.get('html') or player_params['relatedItemsURL'])
 
         return {
             '_type': 'url_transparent',
             'id': video_id,
             'url': theplatform_url,
-
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
         }
index 6c9fdb7c1aceb35efc166c9207fd503603040b9b..d1ab66b3216d5153a5480769fb0723919f3fdb37 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class ScreenwaveMediaIE(InfoExtractor):
-    _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
+    _VALID_URL = r'http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
 
     _TESTS = [{
         'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
@@ -20,7 +20,10 @@ class ScreenwaveMediaIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+
+        playerdata = self._download_webpage(
+            'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id,
+            video_id, 'Downloading player webpage')
 
         vidtitle = self._search_regex(
             r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
@@ -81,60 +84,6 @@ class ScreenwaveMediaIE(InfoExtractor):
         }
 
 
-class CinemassacreIE(InfoExtractor):
-    _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
-    _TESTS = [
-        {
-            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
-            'md5': 'fde81fbafaee331785f58cd6c0d46190',
-            'info_dict': {
-                'id': 'Cinemassacre-19911',
-                'ext': 'mp4',
-                'upload_date': '20121110',
-                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
-                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
-            },
-        },
-        {
-            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
-            'md5': 'd72f10cd39eac4215048f62ab477a511',
-            'info_dict': {
-                'id': 'Cinemassacre-521be8ef82b16',
-                'ext': 'mp4',
-                'upload_date': '20131002',
-                'title': 'The Mummy’s Hand (1940)',
-            },
-        }
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
-        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
-
-        webpage = self._download_webpage(url, display_id)
-
-        playerdata_url = self._search_regex(
-            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
-            webpage, 'player data URL')
-        video_title = self._html_search_regex(
-            r'<title>(?P<title>.+?)\|', webpage, 'title')
-        video_description = self._html_search_regex(
-            r'<div class="entry-content">(?P<description>.+?)</div>',
-            webpage, 'description', flags=re.DOTALL, fatal=False)
-        video_thumbnail = self._og_search_thumbnail(webpage)
-
-        return {
-            '_type': 'url_transparent',
-            'display_id': display_id,
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_date,
-            'thumbnail': video_thumbnail,
-            'url': playerdata_url,
-        }
-
-
 class TeamFourIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?'
     _TEST = {
@@ -153,7 +102,7 @@ class TeamFourIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
 
         playerdata_url = self._search_regex(
-            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+            r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
             webpage, 'player data URL')
 
         video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
new file mode 100644 (file)
index 0000000..9c53704
--- /dev/null
@@ -0,0 +1,145 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unsmuggle_url,
+)
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class SenateISVPIE(InfoExtractor):
+    """Extractor for the Integrated Senate Video Player (ISVP) on senate.gov.
+
+    The player URL carries its parameters in the query string; this
+    extractor reads ``filename``, ``type`` and ``comm`` (all required) and
+    the optional ``poster`` from it and builds the stream URLs from
+    ``_COMM_MAP``.
+    """
+
+    # Each entry is [committee key, stream number, base URL]. The key is
+    # compared against the ``comm`` query parameter (or against 'arch' when
+    # ``type=arch``); the remaining two items are returned by
+    # _get_info_for_comm().
+    _COMM_MAP = [
+        ["ag", "76440", "http://ag-f.akamaihd.net"],
+        ["aging", "76442", "http://aging-f.akamaihd.net"],
+        ["approps", "76441", "http://approps-f.akamaihd.net"],
+        ["armed", "76445", "http://armed-f.akamaihd.net"],
+        ["banking", "76446", "http://banking-f.akamaihd.net"],
+        ["budget", "76447", "http://budget-f.akamaihd.net"],
+        ["cecc", "76486", "http://srs-f.akamaihd.net"],
+        ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
+        ["csce", "75229", "http://srs-f.akamaihd.net"],
+        ["dpc", "76590", "http://dpc-f.akamaihd.net"],
+        ["energy", "76448", "http://energy-f.akamaihd.net"],
+        ["epw", "76478", "http://epw-f.akamaihd.net"],
+        ["ethics", "76449", "http://ethics-f.akamaihd.net"],
+        ["finance", "76450", "http://finance-f.akamaihd.net"],
+        ["foreign", "76451", "http://foreign-f.akamaihd.net"],
+        ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
+        ["help", "76452", "http://help-f.akamaihd.net"],
+        ["indian", "76455", "http://indian-f.akamaihd.net"],
+        ["intel", "76456", "http://intel-f.akamaihd.net"],
+        ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
+        ["jccic", "85180", "http://jccic-f.akamaihd.net"],
+        ["jec", "76458", "http://jec-f.akamaihd.net"],
+        ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
+        ["rpc", "76591", "http://rpc-f.akamaihd.net"],
+        ["rules", "76460", "http://rules-f.akamaihd.net"],
+        ["saa", "76489", "http://srs-f.akamaihd.net"],
+        ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
+        ["srs", "75229", "http://srs-f.akamaihd.net"],
+        ["uscc", "76487", "http://srs-f.akamaihd.net"],
+        ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
+        ["arch", "", "http://ussenate-f.akamaihd.net/"]
+    ]
+    _IE_NAME = 'senate.gov'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
+    _TESTS = [{
+        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _search_iframe_url(webpage):
+        """Return the first ISVP iframe URL found in webpage, or None."""
+        mobj = re.search(
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _get_info_for_comm(self, committee):
+        """Return [stream number, base URL] for committee, or None if unknown."""
+        for entry in self._COMM_MAP:
+            if entry[0] == committee:
+                return entry[1:]
+
+    def _real_extract(self, url):
+        """Build the info dict for an ISVP player URL.
+
+        Raises ExtractorError (expected) when a required query parameter is
+        missing or when the committee has no entry in _COMM_MAP.
+        """
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        # Strip a literal '.mp4' suffix only; the dot must be escaped so that
+        # names such as 'foo_mp4' are left intact.
+        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        if smuggled_data.get('force_title'):
+            title = smuggled_data['force_title']
+        else:
+            title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+        poster = qs.get('poster')
+        thumbnail = poster[0] if poster else None
+
+        video_type = qs['type'][0]
+        # Archived videos share a single 'arch' entry regardless of committee
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+        comm_info = self._get_info_for_comm(committee)
+        if not comm_info:
+            # Fail with a readable message instead of a TypeError on unpacking
+            raise ExtractorError(
+                'Unknown committee: %s' % committee, expected=True)
+        stream_num, domain = comm_info
+
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            formats = [{
+                # All parameters in the query string are necessary to prevent a 403 error
+                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
+            }]
+        else:
+            hdcore_sign = '?hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                # Append the '-p' / '-b' marker from the playlist name to the format id
+                mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+            self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
index 26ced716e8a875f1c4c5c9527b856475dce83f9e..a07677686a4ecc2923b310c3aeeeaab610bb0868 100644 (file)
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-import re
 import base64
 
 from .common import InfoExtractor
@@ -35,8 +34,7 @@ class SharedIE(InfoExtractor):
             raise ExtractorError(
                 'Video %s does not exist' % video_id, expected=True)
 
-        download_form = dict(re.findall(
-            r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
+        download_form = self._hidden_inputs(webpage)
         request = compat_urllib_request.Request(
             url, compat_urllib_parse.urlencode(download_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -47,7 +45,7 @@ class SharedIE(InfoExtractor):
         video_url = self._html_search_regex(
             r'data-url="([^"]+)"', video_page, 'video URL')
         title = base64.b64decode(self._html_search_meta(
-            'full:title', webpage, 'title')).decode('utf-8')
+            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
         filesize = int_or_none(self._html_search_meta(
             'full:size', webpage, 'file size', fatal=False))
         thumbnail = self._html_search_regex(
index 24746a09a0c2183e8a0bd8e239cb59291b41f19a..93a7cfe15cc764bc61b912dd2e3283d950790565 100644 (file)
@@ -53,7 +53,7 @@ class SmotriIE(InfoExtractor):
                 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
             },
         },
-        # video-password
+        # video-password, not approved by moderator
         {
             'url': 'http://smotri.com/video/view/?id=v1390466a13c',
             'md5': 'f6331cef33cad65a0815ee482a54440b',
@@ -71,7 +71,24 @@ class SmotriIE(InfoExtractor):
             },
             'skip': 'Video is not approved by moderator',
         },
-        # age limit + video-password
+        # video-password
+        {
+            'url': 'http://smotri.com/video/view/?id=v6984858774#',
+            'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
+            'info_dict': {
+                'id': 'v6984858774',
+                'ext': 'mp4',
+                'title': 'Дача Солженицина ПАРОЛЬ 223322',
+                'uploader': 'psavari1',
+                'uploader_id': 'psavari1',
+                'upload_date': '20081103',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'videopassword': '223322',
+            },
+        },
+        # age limit + video-password, not approved by moderator
         {
             'url': 'http://smotri.com/video/view/?id=v15408898bcf',
             'md5': '91e909c9f0521adf5ee86fbe073aad70',
@@ -90,19 +107,22 @@ class SmotriIE(InfoExtractor):
             },
             'skip': 'Video is not approved by moderator',
         },
-        # not approved by moderator, but available
+        # age limit + video-password
         {
-            'url': 'http://smotri.com/video/view/?id=v28888533b73',
-            'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+            'url': 'http://smotri.com/video/view/?id=v7780025814',
+            'md5': 'b4599b068422559374a59300c5337d72',
             'info_dict': {
-                'id': 'v28888533b73',
+                'id': 'v7780025814',
                 'ext': 'mp4',
-                'title': 'Russian Spies Killed By ISIL Child Soldier',
-                'uploader': 'Mopeder',
-                'uploader_id': 'mopeder',
-                'duration': 71,
-                'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
-                'upload_date': '20150114',
+                'title': 'Sexy Beach (пароль 123)',
+                'uploader': 'вАся',
+                'uploader_id': 'asya_prosto',
+                'upload_date': '20081218',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'age_limit': 18,
+            },
+            'params': {
+                'videopassword': '123'
             },
         },
         # swf player
@@ -152,6 +172,10 @@ class SmotriIE(InfoExtractor):
             'getvideoinfo': '1',
         }
 
+        video_password = self._downloader.params.get('videopassword', None)
+        if video_password:
+            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
+
         request = compat_urllib_request.Request(
             'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -161,13 +185,18 @@ class SmotriIE(InfoExtractor):
         video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
 
         if not video_url:
-            if video.get('_moderate_no') or not video.get('moderated'):
+            if video.get('_moderate_no'):
                 raise ExtractorError(
                     'Video %s has not been approved by moderator' % video_id, expected=True)
 
             if video.get('error'):
                 raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
+            if video.get('_pass_protected') == 1:
+                msg = ('Invalid video password' if video_password
+                       else 'This video is protected by a password, use the --video-password option')
+                raise ExtractorError(msg, expected=True)
+
         title = video['title']
         thumbnail = video['_imgURL']
         upload_date = unified_strdate(video['added'])
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
new file mode 100644 (file)
index 0000000..6977afb
--- /dev/null
@@ -0,0 +1,181 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
+
+
+class SnagFilmsEmbedIE(InfoExtractor):
+    """Extractor for the embeddable SnagFilms player (``/embed/player?filmId=...``)."""
+
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
+    _TESTS = [{
+        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+        'md5': '2924e9215c6eff7a55ed35b72276bd93',
+        'info_dict': {
+            'id': '74849a00-85a9-11e1-9660-123139220831',
+            'ext': 'mp4',
+            'title': '#whilewewatch',
+        }
+    }, {
+        # invalid labels, 360p is better than 480p
+        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+        'md5': '882fca19b9eb27ef865efeeaed376a48',
+        'info_dict': {
+            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+            'ext': 'mp4',
+            'title': 'Life in Limbo',
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        """Return the src of the first SnagFilms player iframe in webpage, or None."""
+        found = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
+            webpage)
+        return found.group('url') if found else None
+
+    def _real_extract(self, url):
+        """Collect the formats and title from the embed player page.
+
+        Raises ExtractorError (expected) when the page reports that the film
+        is not playable in the viewer's area.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        geo_blocked = '>This film is not playable in your area.<' in webpage
+        if geo_blocked:
+            raise ExtractorError(
+                'Film %s is not playable in your area.' % video_id, expected=True)
+
+        # The player config embeds the sources as a JavaScript array literal
+        sources_js = self._search_regex(
+            r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')
+        sources = self._parse_json(js_to_json(sources_js), video_id)
+
+        formats = []
+        for source in sources:
+            media_url = source.get('file')
+            if not media_url:
+                continue
+
+            ext = determine_ext(media_url)
+            # Treated as HLS only when both the declared type and the
+            # extension are m3u8
+            if source.get('type') == 'm3u8' and ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', m3u8_id='hls'))
+                continue
+
+            label = source.get('label') or ext
+            tbr = self._search_regex(
+                [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
+                media_url, 'bitrate', default=None)
+            height = self._search_regex(
+                r'^(\d+)[pP]$', label, 'height', default=None)
+            formats.append({
+                'url': media_url,
+                'format_id': label,
+                'tbr': int_or_none(tbr),
+                'height': int_or_none(height),
+            })
+        self._sort_formats(formats)
+
+        title_patterns = [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>']
+        title = self._search_regex(title_patterns, webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+
+class SnagFilmsIE(InfoExtractor):
+    """Extractor for SnagFilms film and show pages.
+
+    Collects the metadata from the page and hands the actual media
+    extraction over to the embed player URL via a ``url_transparent`` result.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+        'md5': '19844f897b35af219773fd63bdec2942',
+        'info_dict': {
+            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+            'display_id': 'lost_for_life',
+            'ext': 'mp4',
+            'title': 'Lost for Life',
+            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 4489,
+            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+        'info_dict': {
+            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+            'display_id': 'the_world_cut_project/india',
+            'ext': 'mp4',
+            'title': 'India',
+            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 979,
+            'categories': ['Documentary', 'Sports', 'Politics']
+        }
+    }, {
+        # Film is not playable in your area.
+        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+        'only_matching': True,
+    }, {
+        # Film is not available.
+        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return a url_transparent result pointing at the embed player.
+
+        Raises ExtractorError (expected) when the page says the film is not
+        available.
+        """
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if ">Sorry, the Film you're looking for is not available.<" in webpage:
+            raise ExtractorError(
+                'Film %s is not available.' % display_id, expected=True)
+
+        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+        # Raw string: the pattern contains regex escapes (\. \s \[) that are
+        # not valid Python string escapes.
+        snag = self._parse_json(
+            self._search_regex(
+                r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+            display_id)
+
+        # Prefer the structured page data for the film matching film_id
+        for item in snag:
+            if item.get('data', {}).get('film', {}).get('id') == film_id:
+                data = item['data']['film']
+                title = data['title']
+                description = clean_html(data.get('synopsis'))
+                thumbnail = data.get('image')
+                duration = int_or_none(data.get('duration') or data.get('runtime'))
+                categories = [
+                    category['title'] for category in data.get('categories', [])
+                    if category.get('title')]
+                break
+        else:
+            # No matching entry in the page data: fall back to scraping the HTML
+            title = self._search_regex(
+                r'itemprop="title">([^<]+)<', webpage, 'title')
+            description = self._html_search_regex(
+                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+                webpage, 'description', default=None) or self._og_search_description(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            duration = parse_duration(self._search_regex(
+                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+                webpage, 'duration', fatal=False))
+            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+            'id': film_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644 (file)
index b5fa6f1..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
-    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
-    _TEST = {
-        'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
-        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
-        'info_dict': {
-            'id': '437BE28B89D799D7',
-            'title': 'big_buck_bunny_720p_surround.avi',
-            'ext': 'avi',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://sockshare.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        confirm_hash = self._html_search_regex(r'''(?x)<input\s+
-            type="hidden"\s+
-            value="([^"]*)"\s+
-            name="hash"
-            ''', webpage, 'hash')
-
-        fields = {
-            "hash": confirm_hash.encode('utf-8'),
-            "confirm": "Continue as Free User"
-        }
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.sockshare.com')
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        webpage = self._download_webpage(
-            req, video_id, 'Downloading video page')
-
-        video_url = self._html_search_regex(
-            r'<a href="([^"]*)".+class="download_file_link"',
-            webpage, 'file url')
-        video_url = "http://www.sockshare.com" + video_url
-        title = self._html_search_regex((
-            r'<h1>(.+)<strong>',
-            r'var name = "([^"]+)";'),
-            webpage, 'title', default=None)
-        thumbnail = self._html_search_regex(
-            r'<img\s+src="([^"]*)".+?name="bg"',
-            webpage, 'thumbnail', default=None)
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': determine_ext(title),
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
index 11edf616ac8a781f0765cec94a7e49b2cb07c32c..ba2d5e19bc0d1de322b4b12ed5b8c0dc31157f7f 100644 (file)
@@ -6,9 +6,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urllib_request
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
 )
-from ..utils import sanitize_url_path_consecutive_slashes
 
 
 class SohuIE(InfoExtractor):
@@ -23,9 +26,7 @@ class SohuIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'MV:Far East Movement《The Illest》',
         },
-        'params': {
-            'cn_verification_proxy': 'proxy.uku.im:8888'
-        }
+        'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
         'md5': '699060e75cf58858dd47fb9c03c42cfb',
@@ -47,6 +48,7 @@ class SohuIE(InfoExtractor):
         'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
         'info_dict': {
             'id': '78910339',
+            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
         },
         'playlist': [{
             'md5': 'bdbfb8f39924725e6589c146bc1883ad',
@@ -110,12 +112,21 @@ class SohuIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        title = self._og_search_title(webpage)
+        title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))
 
         vid = self._html_search_regex(
             r'var vid ?= ?["\'](\d+)["\']',
             webpage, 'video path')
         vid_data = _fetch_data(vid, mytv)
+        if vid_data['play'] != 1:
+            if vid_data.get('status') == 12:
+                raise ExtractorError(
+                    'Sohu said: There\'s something wrong in the video.',
+                    expected=True)
+            else:
+                raise ExtractorError(
+                    'Sohu said: The video is only licensed to users in Mainland China.',
+                    expected=True)
 
         formats_json = {}
         for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
@@ -132,23 +143,41 @@ class SohuIE(InfoExtractor):
             formats = []
             for format_id, format_data in formats_json.items():
                 allot = format_data['allot']
-                prot = format_data['prot']
 
                 data = format_data['data']
                 clips_url = data['clipsURL']
                 su = data['su']
 
-                part_str = self._download_webpage(
-                    'http://%s/?prot=%s&file=%s&new=%s' %
-                    (allot, prot, clips_url[i], su[i]),
-                    video_id,
-                    'Downloading %s video URL part %d of %d'
-                    % (format_id, i + 1, part_count))
+                video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
+                retries = 0
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                    }
 
-                part_info = part_str.split('|')
+                    if cdnId is not None:
+                        params['idc'] = cdnId
 
-                video_url = sanitize_url_path_consecutive_slashes(
-                    '%s%s?key=%s' % (part_info[0], su[i], part_info[3]))
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
+                    if retries > 0:
+                        download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+                        video_id, download_note), video_id)
+
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
+
+                    retries += 1
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')
 
                 formats.append({
                     'url': video_url,
@@ -172,9 +201,10 @@ class SohuIE(InfoExtractor):
             info['id'] = video_id
         else:
             info = {
-                '_type': 'playlist',
+                '_type': 'multi_video',
                 'entries': playlist,
                 'id': video_id,
+                'title': title,
             }
 
         return info
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644 (file)
index 0000000..5da66ca
--- /dev/null
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_start,
+    xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        episodes = self._parse_json(
+            self._search_regex(
+                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+            None)
+        return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+    def _get_subtitles(self, config, video_id):
+        sub_langs = {}  # subtitle id -> title, read from the <subtitles> index
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}  # keyed by the title looked up in sub_langs
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            if not sub_id or not iv or not data:  # check sub_id, not the builtin id()
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                webpage = ee.cause.read()
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = [
+            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+            for episode in self._get_episodes(webpage)]
+
+        return self.playlist_result(entries, show_id, title, description)
index 316b2c90f110770299084889552b8137e072a617..118ca483265cfda0a4419d93e8ce2fd174148be4 100644 (file)
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
                             (?P<uploader>[\w\d-]+)/
-                            (?!sets/|likes/?(?:$|[?#]))
+                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
                             (?P<title>[\w\d-]+)/?
                             (?P<token>[^?]+?)?(?:[?].*)?$)
                        |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -221,7 +221,12 @@ class SoundcloudIE(InfoExtractor):
                 info_json_url += "&secret_token=" + token
         elif mobj.group('player'):
             query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-            return self.url_result(query['url'][0])
+            real_url = query['url'][0]
+            # If the token is in the query of the original url we have to
+            # manually add it
+            if 'secret_token' in query:
+                real_url += '?secret_token=' + query['secret_token'][0]
+            return self.url_result(real_url)
         else:
             # extract uploader (which is in the url)
             uploader = mobj.group('uploader')
@@ -274,9 +279,8 @@ class SoundcloudSetIE(SoundcloudIE):
         info = self._download_json(resolv_url, full_title)
 
         if 'errors' in info:
-            for err in info['errors']:
-                self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
-            return
+            msgs = (compat_str(err['error_message']) for err in info['errors'])
+            raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
 
         return {
             '_type': 'playlist',
@@ -303,6 +307,9 @@ class SoundcloudUserIE(SoundcloudIE):
             'title': 'The Royal Concept',
         },
         'playlist_mincount': 1,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -332,7 +339,7 @@ class SoundcloudUserIE(SoundcloudIE):
             if len(new_entries) == 0:
                 self.to_screen('%s: End page received' % uploader)
                 break
-            entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+            entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)
 
         return {
             '_type': 'playlist',
index c20397b3d1bbffb69188ac872facd36cce5b11f7..7fb165a872766f4c1917d2929ec8a73b8f74434e 100644 (file)
@@ -1,3 +1,4 @@
+# encoding: utf-8
 from __future__ import unicode_literals
 
 from .mtv import MTVServicesInfoExtractor
@@ -5,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor
 
 class SouthParkIE(MTVServicesInfoExtractor):
     IE_NAME = 'southpark.cc.com'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
 
     _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
 
@@ -20,9 +21,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
     }]
 
 
-class SouthparkDeIE(SouthParkIE):
+class SouthParkEsIE(SouthParkIE):
+    IE_NAME = 'southpark.cc.com:español'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+    _LANG = 'es'
+
+    _TESTS = [{
+        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+        'playlist_count': 4,
+    }]
+
+
+class SouthParkDeIE(SouthParkIE):
     IE_NAME = 'southpark.de'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
     _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
 
     _TESTS = [{
@@ -34,3 +46,25 @@ class SouthparkDeIE(SouthParkIE):
             'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
         },
     }]
+
+
+class SouthParkNlIE(SouthParkIE):
+    IE_NAME = 'southpark.nl'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
+        'playlist_count': 4,
+    }]
+
+
+class SouthParkDkIE(SouthParkIE):
+    IE_NAME = 'southparkstudios.dk'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
+        'playlist_count': 4,
+    }]
index b936202f6f3005fe9ae085724566d709c6a484cc..5fa6faf18b738aa32e384972bf65ad56188ad9b4 100644 (file)
@@ -4,7 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 )
@@ -27,7 +27,7 @@ class SpankwireIE(InfoExtractor):
             'description': 'Crazy Bitch X rated music video.',
             'uploader': 'oreusz',
             'uploader_id': '124697',
-            'upload_date': '20070508',
+            'upload_date': '20070507',
             'age_limit': 18,
         }
     }
@@ -44,7 +44,7 @@ class SpankwireIE(InfoExtractor):
         title = self._html_search_regex(
             r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'<div\s+id="descriptionContent">([^<]+)<',
+            r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
             webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
             r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -64,14 +64,14 @@ class SpankwireIE(InfoExtractor):
             r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
             webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+            r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
             webpage, 'comment count', fatal=False))
 
         video_urls = list(map(
-            compat_urllib_parse.unquote,
-            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+            compat_urllib_parse_unquote,
+            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
-            password = self._html_search_regex(
+            password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
                 webpage, 'password').replace('+', ' ')
             video_urls = list(map(
index 98cf92d89a1151edfd11b8f15a86eeaa6a83178d..27f4033c547a9700db6af520c4fe4a957e3755c0 100644 (file)
@@ -2,7 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    float_or_none,
+)
 
 
 class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
             'thumbnail': 're:http://.*\.jpg$',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         }
     }, {
@@ -51,9 +55,39 @@ class SpiegeltvIE(InfoExtractor):
         is_wide = media_json['is_wide']
 
         server_json = self._download_json(
-            'http://www.spiegel.tv/streaming_servers/', video_id,
-            note='Downloading server information')
-        server = server_json[0]['endpoint']
+            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+            video_id, note='Downloading server information')
+
+        format = '16x9' if is_wide else '4x3'
+
+        formats = []
+        for streamingserver in server_json['streamingserver']:
+            endpoint = streamingserver.get('endpoint')
+            if not endpoint:
+                continue
+            play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+            if endpoint.startswith('rtmp'):
+                formats.append({
+                    'url': endpoint,
+                    'format_id': 'rtmp',
+                    'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+                    'play_path': play_path,
+                    'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+                    'ext': 'flv',
+                    'rtmp_live': True,
+                })
+            elif determine_ext(endpoint) == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    endpoint.replace('[video]', play_path),
+                    video_id, 'm4v',
+                    preference=1,  # Prefer hls since it allows to workaround georestriction
+                    m3u8_id='hls', fatal=False)
+                if m3u8_formats is not False:
+                    formats.extend(m3u8_formats)
+            else:
+                formats.append({
+                    'url': endpoint,
+                })
 
         thumbnails = []
         for image in media_json['images']:
@@ -65,16 +99,12 @@ class SpiegeltvIE(InfoExtractor):
 
         description = media_json['subtitle']
         duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
-        format = '16x9' if is_wide else '4x3'
-
-        url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
 
         return {
             'id': video_id,
             'title': title,
-            'url': url,
-            'ext': 'm4v',
             'description': description,
             'duration': duration,
-            'thumbnails': thumbnails
+            'thumbnails': thumbnails,
+            'formats': formats,
         }
index e529bb55ccccb1beefdf12d2df1ea689dd0d6f2e..182f286dfefc4023483c422fbf6c6a73203b86ff 100644 (file)
@@ -5,7 +5,7 @@ from .mtv import MTVServicesInfoExtractor
 
 class SpikeIE(MTVServicesInfoExtractor):
     _VALID_URL = r'''(?x)https?://
-        (?:www\.spike\.com/(?:video-clips|(?:full-)?episodes)/.+|
+        (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+|
          m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+))
         '''
     _TEST = {
index becdf658f6e0ce8b209dffc0ce4c96a2857099dc..86d509ae5351a3cc15be66dda9f485d63ec166ba 100644 (file)
@@ -4,37 +4,36 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
-    parse_duration,
-    parse_iso8601,
+    unified_strdate,
 )
 
 
 class SportBoxIE(InfoExtractor):
-    _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
-    _TESTS = [
-        {
-            'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
-            'md5': 'ff56a598c2cf411a9a38a69709e97079',
-            'info_dict': {
-                'id': '80822',
-                'ext': 'mp4',
-                'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
-                'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'timestamp': 1411896237,
-                'upload_date': '20140928',
-                'duration': 4846,
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        }, {
-            'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
-            'only_matching': True,
-        }
-    ]
+    _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+    _TESTS = [{
+        'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+        'md5': 'ff56a598c2cf411a9a38a69709e97079',
+        'info_dict': {
+            'id': '80822',
+            'ext': 'mp4',
+            'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20140928',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+        'only_matching': True,
+    }, {
+        'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        video_id = self._search_regex(
-            r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+        player = self._search_regex(
+            r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
+
+        title = self._html_search_regex(
+            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+            webpage, 'title')
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._html_search_meta(
+            'dateCreated', webpage, 'upload date'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': compat_urlparse.urljoin(url, '/%s' % player),
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
 
-        player = self._download_webpage(
-            'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
-            display_id, 'Downloading player webpage')
+
+class SportBoxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+        'info_dict': {
+            'id': '211355',
+            'ext': 'mp4',
+            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
 
         hls = self._search_regex(
-            r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+            r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
+            webpage, 'hls file')
 
-        formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+        formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
 
-        title = self._html_search_regex(
-            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
-        thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = parse_iso8601(self._search_regex(
-            r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+        title = self._search_regex(
+            r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+
+        thumbnail = self._search_regex(
+            r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
+            webpage, 'thumbnail', default=None)
 
         return {
             'id': video_id,
-            'display_id': display_id,
             'title': title,
-            'description': description,
             'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py
new file mode 100644 (file)
index 0000000..77eec0b
--- /dev/null
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class SrfIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})'
+    _TESTS = [{
+        'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'md5': '4cd93523723beff51bb4bee974ee238d',
+        'info_dict': {
+            'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+            'display_id': 'snowden-beantragt-asyl-in-russland',
+            'ext': 'm4v',
+            'upload_date': '20130701',
+            'title': 'Snowden beantragt Asyl in Russland',
+            'timestamp': 1372713995,
+        }
+    }, {
+        # No Speichern (Save) button
+        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
+        'md5': 'd97e236e80d1d24729e5d0953d276a4f',
+        'info_dict': {
+            'id': '677f5829-e473-4823-ac83-a1087fe97faa',
+            'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive',
+            'ext': 'flv',
+            'upload_date': '20130710',
+            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
+            'timestamp': 1373493600,
+        },
+    }, {
+        'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'only_matching': True,
+    }, {
+        'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        display_id = re.match(self._VALID_URL, url).group('display_id') or video_id
+
+        video_data = self._download_xml(
+            'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % video_id,
+            display_id)
+
+        title = xpath_text(
+            video_data, './AssetMetadatas/AssetMetadata/title', fatal=True)
+        thumbnails = [{
+            'url': s.text
+        } for s in video_data.findall('.//ImageRepresentation/url')]
+        timestamp = parse_iso8601(xpath_text(video_data, './createdDate'))
+        # The <duration> field in XML is different from the exact duration, skipping
+
+        formats = []
+        for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'):
+            for url_node in item.findall('url'):
+                quality = url_node.attrib['quality']
+                full_url = url_node.text
+                original_ext = determine_ext(full_url)
+                format_id = '%s-%s' % (quality, item.attrib['protocol'])
+                if original_ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id))
+                elif original_ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        full_url, display_id, 'mp4', m3u8_id=format_id))
+                else:
+                    formats.append({
+                        'url': full_url,
+                        'ext': original_ext,
+                        'format_id': format_id,
+                        'quality': 0 if 'HD' in quality else -1,
+                        'preference': 1,
+                    })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitles_data = video_data.find('Subtitles')
+        if subtitles_data is not None:
+            subtitles_list = [{
+                'url': sub.text,
+                'ext': determine_ext(sub.text),
+            } for sub in subtitles_data]
+            if subtitles_list:
+                subtitles['de'] = subtitles_list
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            'title': title,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'subtitles': subtitles,
+        }
index 854d01beeb5cefd1f82d7991ee2c0ce75ad33dfa..e527aa97188b1860e054f8af7c7bd7a33301729e 100644 (file)
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
             webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
-            r'class="views">\s*(\d+)\s*<',
+            r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
             webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
             r'(\d+)</b> Comments?',
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
new file mode 100644 (file)
index 0000000..fc20f66
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+
+class SVTBaseIE(InfoExtractor):
+    def _extract_video(self, url, video_id):
+        info = self._download_json(url, video_id)
+
+        title = info['context']['title']
+        thumbnail = info['context'].get('thumbnailImage')
+
+        video_info = info['video']
+        formats = []
+        for vr in video_info['videoReferences']:
+            vurl = vr['url']
+            ext = determine_ext(vurl)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    vurl, video_id,
+                    ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id=vr.get('playerType')))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    vurl + '?hdcore=3.3.0', video_id,
+                    f4m_id=vr.get('playerType')))
+            else:
+                formats.append({
+                    'format_id': vr.get('playerType'),
+                    'url': vurl,
+                })
+        self._sort_formats(formats)
+
+        duration = video_info.get('materialLength')
+        age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+        }
+
+
+class SVTIE(SVTBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
+        'md5': '9648197555fc1b49e3dc22db4af51d46',
+        'info_dict': {
+            'id': '2900353',
+            'ext': 'flv',
+            'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+            'duration': 27,
+            'age_limit': 0,
+        },
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        widget_id = mobj.group('widget_id')
+        article_id = mobj.group('id')
+        return self._extract_video(
+            'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
+            article_id)
+
+
+class SVTPlayIE(SVTBaseIE):
+    IE_DESC = 'SVT Play and Öppet arkiv'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
+        'md5': 'ade3def0643fa1c40587a422f98edfd9',
+        'info_dict': {
+            'id': '2609989',
+            'ext': 'flv',
+            'title': 'SM veckan vinter, Örebro - Rally, final',
+            'duration': 4500,
+            'thumbnail': 're:^https?://.*[\.-]jpg$',
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
+        'md5': 'c3101a17ce9634f4c1f9800f0746c187',
+        'info_dict': {
+            'id': '1058509',
+            'ext': 'flv',
+            'title': 'Farlig kryssning',
+            'duration': 2566,
+            'thumbnail': 're:^https?://.*[\.-]jpg$',
+            'age_limit': 0,
+        },
+        'skip': 'Only works from Sweden',
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+        return self._extract_video(
+            'http://www.%s.se/video/%s?output=json' % (host, video_id),
+            video_id)
diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svtplay.py
deleted file mode 100644 (file)
index 433dfd1..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-)
-
-
-class SVTPlayIE(InfoExtractor):
-    IE_DESC = 'SVT Play and Öppet arkiv'
-    _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
-        'md5': 'ade3def0643fa1c40587a422f98edfd9',
-        'info_dict': {
-            'id': '2609989',
-            'ext': 'flv',
-            'title': 'SM veckan vinter, Örebro - Rally, final',
-            'duration': 4500,
-            'thumbnail': 're:^https?://.*[\.-]jpg$',
-            'age_limit': 0,
-        },
-    }, {
-        'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
-        'md5': 'c3101a17ce9634f4c1f9800f0746c187',
-        'info_dict': {
-            'id': '1058509',
-            'ext': 'flv',
-            'title': 'Farlig kryssning',
-            'duration': 2566,
-            'thumbnail': 're:^https?://.*[\.-]jpg$',
-            'age_limit': 0,
-        },
-        'skip': 'Only works from Sweden',
-    }]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        host = mobj.group('host')
-
-        info = self._download_json(
-            'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id)
-
-        title = info['context']['title']
-        thumbnail = info['context'].get('thumbnailImage')
-
-        video_info = info['video']
-        formats = []
-        for vr in video_info['videoReferences']:
-            vurl = vr['url']
-            ext = determine_ext(vurl)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    vurl, video_id,
-                    ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id=vr.get('playerType')))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    vurl + '?hdcore=3.3.0', video_id,
-                    f4m_id=vr.get('playerType')))
-            else:
-                formats.append({
-                    'format_id': vr.get('playerType'),
-                    'url': vurl,
-                })
-        self._sort_formats(formats)
-
-        duration = video_info.get('materialLength')
-        age_limit = 18 if video_info.get('inappropriateForChildren') else 0
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'age_limit': age_limit,
-        }
index bfe07b02417a2a44f23a09c10c25d48ec18b5535..cf1b37a7533f3af05022cd2fa2a7f908e8edb9a0 100644 (file)
@@ -11,13 +11,13 @@ class TagesschauIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
 
     _TESTS = [{
-        'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
-        'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
+        'md5': '917a228bc7df7850783bc47979673a09',
         'info_dict': {
-            'id': '1399128',
+            'id': '102143',
             'ext': 'mp4',
-            'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
-            'description': 'md5:69da3c61275b426426d711bde96463ab',
+            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+            'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
             'thumbnail': 're:^http:.*\.jpg$',
         },
     }, {
index 1caf08cb752d201066f0a1d4679efffc26598d1e..d1b7264b4ca4a0cb72e491da26d7f5bbc1cc66b7 100644 (file)
@@ -1,13 +1,18 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
 import base64
+import binascii
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     qualities,
+    determine_ext,
 )
+from ..compat import compat_ord
 
 
 class TeamcocoIE(InfoExtractor):
@@ -35,6 +40,28 @@ class TeamcocoIE(InfoExtractor):
                 'duration': 288,
                 'age_limit': 0,
             }
+        }, {
+            'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
+            'info_dict': {
+                'id': '88748',
+                'ext': 'mp4',
+                'title': 'Timothy Olyphant Raises A Toast To “Justified”',
+                'description': 'md5:15501f23f020e793aeca761205e42c24',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
+        }, {
+            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+            'info_dict': {
+                'id': '89341',
+                'ext': 'mp4',
+                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
         }
     ]
     _VIDEO_ID_REGEXES = (
@@ -47,27 +74,70 @@ class TeamcocoIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         display_id = mobj.group('display_id')
-        webpage = self._download_webpage(url, display_id)
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+        if 'src=expired' in urlh.geturl():
+            raise ExtractorError('This video is expired.', expected=True)
 
         video_id = mobj.group('video_id')
         if not video_id:
             video_id = self._html_search_regex(
                 self._VIDEO_ID_REGEXES, webpage, 'video id')
 
-        preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage)
-        if not preloads:
-            raise ExtractorError('Preload information could not be extracted')
-        preload = max([(len(p), p) for p in preloads])[1]
-        data = self._parse_json(
-            base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id)
+        data = None
+
+        preload_codes = self._html_search_regex(
+            r'(function.+)setTimeout\(function\(\)\{playlist',
+            webpage, 'preload codes')
+        base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+        base64_fragments.remove('init')
+
+        def _check_sequence(cur_fragments):
+            if not cur_fragments:
+                return
+            for i in range(len(cur_fragments)):
+                cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii')
+                try:
+                    raw_data = base64.b64decode(cur_sequence)
+                    if compat_ord(raw_data[0]) == compat_ord('{'):
+                        return json.loads(raw_data.decode('utf-8'))
+                except (TypeError, binascii.Error, UnicodeDecodeError, ValueError):
+                    continue
+
+        def _check_data():
+            for i in range(len(base64_fragments) + 1):
+                for j in range(i, len(base64_fragments) + 1):
+                    data = _check_sequence(base64_fragments[:i] + base64_fragments[j:])
+                    if data:
+                        return data
+
+        self.to_screen('Try to compute possible data sequence. This may take some time.')
+        data = _check_data()
+
+        if not data:
+            raise ExtractorError(
+                'Preload information could not be extracted', expected=True)
 
         formats = []
         get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
         for filed in data['files']:
-            if filed['type'] == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    filed['url'], video_id, ext='mp4'))
+            if determine_ext(filed['url']) == 'm3u8':
+                # compat_urllib_parse.urljoin does not work here
+                if filed['url'].startswith('/'):
+                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+                else:
+                    m3u8_url = filed['url']
+                m3u8_formats = self._extract_m3u8_formats(
+                    m3u8_url, video_id, ext='mp4')
+                for m3u8_format in m3u8_formats:
+                    if m3u8_format not in formats:
+                        formats.append(m3u8_format)
+            elif determine_ext(filed['url']) == 'f4m':
+                # TODO Correct f4m extraction
+                continue
             else:
+                if filed['url'].startswith('/mp4:protected/'):
+                    # TODO Correct extraction for these files
+                    continue
                 m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
                 if m_format is not None:
                     format_id = m_format.group(1)
index a2dc14c2b652527930af81bd845466b795e8866f..a48d77c309dcd1f9984cd0a6c71b7af574ca5498 100644 (file)
@@ -10,6 +10,7 @@ from ..utils import int_or_none
 
 
 class TEDIE(InfoExtractor):
+    IE_NAME = 'ted'
     _VALID_URL = r'''(?x)
         (?P<proto>https?://)
         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@@ -194,14 +195,24 @@ class TEDIE(InfoExtractor):
                         'tbr': int_or_none(resource.get('bitrate')),
                     })
             elif format_id == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id))
+                hls_formats = self._extract_m3u8_formats(
+                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
+                for f in hls_formats:
+                    if f.get('format_id') == 'hls-meta':
+                        continue
+                    if not f.get('height'):
+                        f['vcodec'] = 'none'
+                    else:
+                        f['acodec'] = 'none'
+                formats.extend(hls_formats)
 
         audio_download = talk_info.get('audioDownload')
         if audio_download:
             formats.append({
                 'url': audio_download,
                 'format_id': 'audio',
+                'vcodec': 'none',
+                'preference': -0.5,
             })
 
         self._sort_formats(formats)
index 251a686804b6f26915c3fa25d9f6b2cc1f98ed4b..a0c744fd16b633b08e7bbb632b77cf40e8410710 100644 (file)
@@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):
             'title': 'Con Martín Berasategui, hacer un bacalao al ...',
             'duration': 662,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
         'only_matching': True,
index 466155ef800fbc540292eb343bc9092ab9da6416..f6694149b8e3509b4446458300824a3f3d5fc5de 100644 (file)
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+)
 
 
 class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
             if protocol == 'rtmp':
                 url = url.replace('&mp4:', '')
 
+                tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
             formats.append({
-                'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
-                'width': rendition['frameWidth'],
-                'height': rendition['frameHeight'],
-                'tbr': rendition['encodingRate'] / 1024,
-                'filesize': rendition['size'],
+                'format_id': '_'.join(
+                    ['rtmp', rendition['videoContainer'].lower(),
+                     rendition['videoCodec'].lower(), '%sk' % tbr]),
+                'width': int_or_none(rendition['frameWidth']),
+                'height': int_or_none(rendition['frameHeight']),
+                'tbr': tbr,
+                'filesize': int_or_none(rendition['size']),
                 'protocol': protocol,
                 'ext': ext,
                 'vcodec': rendition['videoCodec'].lower(),
                 'container': rendition['videoContainer'].lower(),
                 'url': url,
             })
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
                 'url': json['thumbnailURL']
             }],
             'thumbnail': json['videoStillURL'],
-            'duration': json['length'] / 1000,
-            'timestamp': float(json['creationDate']) / 1000,
-            'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
-            'view_count': json['playsTotal']
+            'duration': float_or_none(json.get('length'), 1000),
+            'timestamp': float_or_none(json.get('creationDate'), 1000),
+            'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+            'view_count': int_or_none(json.get('playsTotal')),
         }
index 6a7b5e49de2d348cb76b88abd57bea59113138a6..26655d690250f495caf98de2cfaad6aff3eda331 100644 (file)
@@ -15,19 +15,37 @@ class TestTubeIE(InfoExtractor):
             'id': '60163',
             'display_id': '5-weird-ways-plants-can-eat-animals',
             'duration': 275,
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': '5 Weird Ways Plants Can Eat Animals',
             'description': 'Why have some plants evolved to eat meat?',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': 'DNews',
             'uploader_id': 'dnews',
         },
+    }, {
+        'url': 'https://testtube.com/iflscience/insane-jet-ski-flipping',
+        'info_dict': {
+            'id': 'fAGfJ4YjVus',
+            'ext': 'mp4',
+            'title': 'Flipping Jet-Ski Skills | Outrageous Acts of Science',
+            'uploader': 'Science Channel',
+            'uploader_id': 'ScienceChannel',
+            'upload_date': '20150203',
+            'description': 'md5:e61374030015bae1d2e22f096d4769d6',
+        }
     }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
+
+        youtube_url = self._html_search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+            webpage, 'youtube iframe', default=None)
+        if youtube_url:
+            return self.url_result(youtube_url, 'Youtube', video_id=display_id)
+
         video_id = self._search_regex(
             r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
             webpage, 'video ID')
index 025d0877cb928bb433aff9f6eff19a29d253e006..3a68eaa80ea6867e6806a4f242a8afc910b8ba06 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
-    _TESTS = {
+    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
             'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 6a006b2d201365eab62c5b090633a1c5e345a48f..83d833e30dbeb60caa43aa272bfd4d35f4507a53 100644 (file)
@@ -26,7 +26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
-           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+           (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
          |theplatform:)(?P<id>[^/\?&]+)'''
 
     _TESTS = [{
@@ -56,6 +56,17 @@ class ThePlatformIE(InfoExtractor):
             # rtmp download
             'skip_download': True,
         }
+    }, {
+        'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+        'info_dict': {
+            'id': 'yMBg9E8KFxZD',
+            'ext': 'mp4',
+            'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+            'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+        }
+    }, {
+        'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -85,6 +96,11 @@ class ThePlatformIE(InfoExtractor):
         if not provider_id:
             provider_id = 'dJ5BDC'
 
+        path = provider_id
+        if mobj.group('media'):
+            path += '/media'
+        path += '/' + video_id
+
         if smuggled_data.get('force_smil_url', False):
             smil_url = url
         elif mobj.group('config'):
@@ -94,8 +110,7 @@ class ThePlatformIE(InfoExtractor):
             config = self._download_json(config_url, video_id, 'Downloading config')
             smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
-            smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?'
-                        'format=smil&mbr=true'.format(provider_id, video_id))
+            smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
 
         sig = smuggled_data.get('sig')
         if sig:
@@ -112,7 +127,7 @@ class ThePlatformIE(InfoExtractor):
         else:
             raise ExtractorError(error_msg, expected=True)
 
-        info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id)
+        info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
         info_json = self._download_webpage(info_url, video_id)
         info = json.loads(info_json)
 
@@ -129,7 +144,9 @@ class ThePlatformIE(InfoExtractor):
         head = meta.find(_x('smil:head'))
         body = meta.find(_x('smil:body'))
 
-        f4m_node = body.find(_x('smil:seq//smil:video')) or body.find(_x('smil:seq/smil:video'))
+        f4m_node = body.find(_x('smil:seq//smil:video'))
+        if f4m_node is None:
+            f4m_node = body.find(_x('smil:seq/smil:video'))
         if f4m_node is not None and '.f4m' in f4m_node.attrib['src']:
             f4m_url = f4m_node.attrib['src']
             if 'manifest.f4m?' not in f4m_url:
@@ -142,7 +159,9 @@ class ThePlatformIE(InfoExtractor):
             formats = []
             switch = body.find(_x('smil:switch'))
             if switch is None:
-                switch = body.find(_x('smil:par//smil:switch')) or body.find(_x('smil:par/smil:switch'))
+                switch = body.find(_x('smil:par//smil:switch'))
+            if switch is None:
+                switch = body.find(_x('smil:par/smil:switch'))
             if switch is None:
                 switch = body.find(_x('smil:par'))
             if switch is not None:
@@ -163,7 +182,9 @@ class ThePlatformIE(InfoExtractor):
                         'vbr': vbr,
                     })
             else:
-                switch = body.find(_x('smil:seq//smil:switch')) or body.find(_x('smil:seq/smil:switch'))
+                switch = body.find(_x('smil:seq//smil:switch'))
+                if switch is None:
+                    switch = body.find(_x('smil:seq/smil:switch'))
                 for f in switch.findall(_x('smil:video')):
                     attr = f.attrib
                     vbr = int_or_none(attr.get('system-bitrate'), 1000)
index a77c6a2fc9f2838305145c97e9920d09635ceba7..5d09eb9a8b28cdd2f8bea0743d947f98415564a2 100644 (file)
@@ -1,9 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
-import re
-
 from .common import InfoExtractor
 from ..utils import unified_strdate
 
@@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor):
             song
         )/(?P<id>[A-Za-z0-9]+)/?$'''
     _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
-    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
+    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
     _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
     _TESTS = [
         {
@@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        song_id = mobj.group('id')
+        song_id = self._match_id(url)
 
         webpage = self._download_webpage(
             self._SONG_URL_TEMPLATE.format(song_id), song_id)
 
-        song_data = json.loads(self._search_regex(
-            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
+        song_data = self._parse_json(self._search_regex(
+            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id)
+
+        if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None):
+            song_data['audio_server'] = 's3.amazonaws.com'
+        else:
+            song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com'
+
         keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
         url = self._SONG_FILE_URL_TEMPLATE.format(
             "".join(reversed(keys)), **song_data)
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644 (file)
index 0000000..36493a5
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+        'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+        'info_dict': {
+            'id': '487',
+            'ext': 'm4a',
+            'title': '487: Harper High School, Part One',
+            'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
+        return {
+            'id': video_id,
+            'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+            'protocol': 'm3u8_native',
+            'ext': 'm4a',
+            'acodec': 'aac',
+            'vcodec': 'none',
+            'abr': 64,
+            'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+            'description': self._html_search_meta(r'description', webpage, 'description'),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
index 9f9e388c50948d658d1022f8514122643b623a03..13263614cc06b099d929ee71564899ac3620f76a 100644 (file)
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
     IE_NAME = 'tlc.com'
     _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
 
-    _TEST = {
+    # DiscoveryIE has _TESTS
+    _TESTS = [{
         'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
-        'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
         'info_dict': {
-            'id': '853232',
+            'id': '104493',
             'ext': 'mp4',
-            'title': 'Cake Boss: Too Big to Fly',
+            'title': 'Too Big to Fly',
             'description': 'Buddy has taken on a high flying task.',
             'duration': 119,
+            'timestamp': 1393365060,
+            'upload_date': '20140225',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
+    }]
 
 
 class TlcDeIE(InfoExtractor):
index c5c6fdc51b19fce90d45298858ce345bc901307e..7dbe68b5c228ea6d3fa20d835a60ecf38b820816 100644 (file)
@@ -30,3 +30,31 @@ class TMZIE(InfoExtractor):
             'description': self._og_search_description(webpage),
             'thumbnail': self._html_search_meta('ThumbURL', webpage),
         }
+
+
+class TMZArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+    _TEST = {
+        'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
+        'md5': 'e482a414a38db73087450e3a6ce69d00',
+        'info_dict': {
+            'id': '0_6snoelag',
+            'ext': 'mp4',
+            'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
+            'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake.  She\'s watching me."',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        embedded_video_info_str = self._html_search_regex(
+            r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info')
+
+        embedded_video_info = self._parse_json(
+            embedded_video_info_str, video_id,
+            transform_source=lambda s: s.replace('\\', ''))
+
+        return self.url_result(
+            'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
index d48cbbf140054e639f7191acfa0909972ef3ab76..49516abca690721a83dee5044bb2cdd6540d4a07 100644 (file)
@@ -3,33 +3,70 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
-    parse_duration,
     fix_xml_ampersands,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    str_to_int,
+    xpath_text,
 )
 
 
-class TNAFlixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+class TNAFlixNetworkBaseIE(InfoExtractor):
+    # May be overridden in descendants if necessary
+    _CONFIG_REGEX = [
+        r'flashvars\.config\s*=\s*escape\("([^"]+)"',
+        r'<input[^>]+name="config\d?" value="([^"]+)"',
+    ]
+    _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
+    _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
+    _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
+    _VIEW_COUNT_REGEX = None
+    _COMMENT_COUNT_REGEX = None
+    _AVERAGE_RATING_REGEX = None
+    _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
 
-    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
-    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
-    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
+    def _extract_thumbnails(self, flix_xml):
+        """Build the timeline thumbnail list described by the config XML.
+
+        Looks for a <timeline> (or <rolloverBarImage>) element holding a URL
+        pattern with a '#' placeholder plus the first and last frame numbers.
+        Returns a list of thumbnail dicts (url, width, height), one per frame
+        number, or None when a required element is missing or malformed.
+        """
 
-    _TEST = {
-        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
-        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
-        'info_dict': {
-            'id': '553878',
-            'display_id': 'Carmella-Decesare-striptease',
-            'ext': 'mp4',
-            'title': 'Carmella Decesare - striptease',
-            'description': '',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'duration': 91,
-            'age_limit': 18,
-        }
-    }
+        def get_child(elem, names):
+            # Return the first child found under any of the alternative tag
+            # names, or None when none of them exists.
+            for name in names:
+                child = elem.find(name)
+                if child is not None:
+                    return child
+
+        timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
+        if timeline is None:
+            return
+
+        pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
+        if pattern_el is None or not pattern_el.text:
+            return
+
+        first_el = get_child(timeline, ['imageFirst', 'first'])
+        last_el = get_child(timeline, ['imageLast', 'last'])
+        if first_el is None or last_el is None:
+            return
+
+        # An empty element has text None; normalise to '' so that it is
+        # rejected by the isdigit() check instead of raising AttributeError.
+        first_text = first_el.text or ''
+        last_text = last_el.text or ''
+        if not first_text.isdigit() or not last_text.isdigit():
+            return
+
+        first = int(first_text)
+        last = int(last_text)
+        if first > last:
+            return
+
+        width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
+        height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
+
+        # One thumbnail per frame number: '#' in the pattern is replaced by it
+        return [{
+            'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
+            'width': width,
+            'height': height,
+        } for i in range(first, last + 1)]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -38,47 +75,195 @@ class TNAFlixIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        title = self._html_search_regex(
-            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
-        description = self._html_search_regex(
-            self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
-
-        age_limit = self._rta_search(webpage)
-
-        duration = self._html_search_meta('duration', webpage, 'duration', default=None)
-        if duration:
-            duration = parse_duration(duration[1:])
-
         cfg_url = self._proto_relative_url(self._html_search_regex(
             self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
 
         cfg_xml = self._download_xml(
-            cfg_url, display_id, note='Downloading metadata',
+            cfg_url, display_id, 'Downloading metadata',
             transform_source=fix_xml_ampersands)
 
-        thumbnail = cfg_xml.find('./startThumb').text
-
         formats = []
+
+        def extract_video_url(vl):
+            return re.sub('speed=\d+', 'speed=', vl.text)
+
+        video_link = cfg_xml.find('./videoLink')
+        if video_link is not None:
+            formats.append({
+                'url': extract_video_url(video_link),
+                'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
+            })
+
         for item in cfg_xml.findall('./quality/item'):
-            video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
-            format_id = item.find('res').text
-            fmt = {
-                'url': video_url,
+            video_link = item.find('./videoLink')
+            if video_link is None:
+                continue
+            res = item.find('res')
+            format_id = None if res is None else res.text
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]', format_id, 'height', default=None))
+            formats.append({
+                'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
                 'format_id': format_id,
-            }
-            m = re.search(r'^(\d+)', format_id)
-            if m:
-                fmt['height'] = int(m.group(1))
-            formats.append(fmt)
+                'height': height,
+            })
+
         self._sort_formats(formats)
 
+        thumbnail = self._proto_relative_url(
+            xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
+        thumbnails = self._extract_thumbnails(cfg_xml)
+
+        title = self._html_search_regex(
+            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+
+        age_limit = self._rta_search(webpage)
+
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration', default=None))
+
+        def extract_field(pattern, name):
+            return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
+
+        description = extract_field(self._DESCRIPTION_REGEX, 'description')
+        uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
+        view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
+        comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
+        average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
+
+        categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
+        categories = categories_str.split(', ') if categories_str is not None else []
+
         return {
             'id': video_id,
             'display_id': display_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
             'duration': duration,
             'age_limit': age_limit,
+            'uploader': uploader,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'average_rating': average_rating,
+            'categories': categories,
             'formats': formats,
         }
+
+
+class TNAFlixIE(TNAFlixNetworkBaseIE):
+    # tnaflix.com video pages. Only the URL pattern and the title,
+    # description and uploader regexes are defined here; no method is
+    # overridden, so extraction is inherited from TNAFlixNetworkBaseIE.
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+
+    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
+    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
+    _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'
+
+    _TESTS = [{
+        # anonymous uploader, no categories
+        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+        'info_dict': {
+            'id': '553878',
+            'display_id': 'Carmella-Decesare-striptease',
+            'ext': 'mp4',
+            'title': 'Carmella Decesare - striptease',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 91,
+            'age_limit': 18,
+            'uploader': 'Anonymous',
+            'categories': [],
+        }
+    }, {
+        # non-anonymous uploader, categories
+        'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
+        'md5': '0f5d4d490dbfd117b8607054248a07c0',
+        'info_dict': {
+            'id': '6538',
+            'display_id': 'Educational-xxx-video',
+            'ext': 'mp4',
+            'title': 'Educational xxx video',
+            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 164,
+            'age_limit': 18,
+            'uploader': 'bobwhite39',
+            'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'],
+        }
+    }, {
+        # URL-matching check only (display id containing a dot)
+        'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+        'only_matching': True,
+    }]
+
+
+class EMPFlixIE(TNAFlixNetworkBaseIE):
+    # empflix.com video pages (/videos/<display_id>-<id>.html). Only the
+    # uploader regex is overridden; the remaining regexes and the extraction
+    # logic come from TNAFlixNetworkBaseIE.
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
+
+    _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>'
+
+    _TESTS = [{
+        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+        'info_dict': {
+            'id': '33051',
+            'display_id': 'Amateur-Finger-Fuck',
+            'ext': 'mp4',
+            'title': 'Amateur Finger Fuck',
+            'description': 'Amateur solo finger fucking.',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 83,
+            'age_limit': 18,
+            'uploader': 'cwbike',
+            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+        }
+    }, {
+        # URL-matching check only (display id containing brackets and dashes)
+        'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+        'only_matching': True,
+    }]
+
+
+class MovieFapIE(TNAFlixNetworkBaseIE):
+    # moviefap.com video pages (/videos/<hex id>/<display_id>.html).
+    # Supplies the view count, comment count and average rating regexes
+    # (None in TNAFlixNetworkBaseIE) and overrides the categories regex;
+    # no method is overridden.
+    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+
+    _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
+    _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
+    _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
+    _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'
+
+    _TESTS = [{
+        # normal, multi-format video
+        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
+        'md5': '26624b4e2523051b550067d547615906',
+        'info_dict': {
+            'id': 'be9867c9416c19f54a4a',
+            'display_id': 'experienced-milf-amazing-handjob',
+            'ext': 'mp4',
+            'title': 'Experienced MILF Amazing Handjob',
+            'description': 'Experienced MILF giving an Amazing Handjob',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'darvinfred06',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
+        }
+    }, {
+        # quirky single-format case where the extension is given as fid, but the video is really an flv
+        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
+        'md5': 'fa56683e291fc80635907168a743c9ad',
+        'info_dict': {
+            'id': 'e5da0d3edce5404418f5',
+            'display_id': 'jeune-couple-russe',
+            'ext': 'flv',
+            'title': 'Jeune Couple Russe',
+            'description': 'Amateur',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'whiskeyjar',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Teen'],
+        }
+    }]
index d73ad3762a1b455cfd4bc384c27e2dd85e776dde..c9cb69333f7da0a9f4fe009e79b06433bca83726 100644 (file)
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
         webpage = self._download_webpage(req, display_id)
 
         flashvars = json.loads(self._html_search_regex(
-            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+            r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
 
         video_url = flashvars['video_url']
         if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
         thumbnail = flashvars.get('image_url')
 
         title = self._html_search_regex(
-            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
-            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)
 
         like_count = int_or_none(self._html_search_regex(
-            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
         dislike_count = int_or_none(self._html_search_regex(
-            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
         view_count = self._html_search_regex(
-            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
         if view_count:
             view_count = str_to_int(view_count)
         comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644 (file)
index 0000000..2c4b218
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    # tubitv.com video pages; supports optional account login and reads the
+    # HLS manifest URL from the obfuscated `apu` value in the page.
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _TEST = {
+        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+        'info_dict': {
+            'id': '54411',
+            'ext': 'mp4',
+            'title': 'The Kitchen Musical - EP01',
+            'thumbnail': 're:^https?://.*\.png$',
+            'description': 'md5:37532716166069b353e8866e71fefae7',
+            'duration': 2407,
+        },
+        'params': {
+            'skip_download': 'HLS download',
+        },
+    }
+
+    def _login(self):
+        # Nothing to do when no credentials were supplied
+        username, password = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+
+        post_body = compat_urllib_parse.urlencode({
+            'username': username,
+            'password': password,
+        }).encode('utf-8')
+        login_request = compat_urllib_request.Request(self._LOGIN_URL, post_body)
+        login_request.add_header(
+            'Content-Type', 'application/x-www-form-urlencoded')
+
+        response_html = self._download_webpage(
+            login_request, None, False, 'Wrong login info')
+        # The response must contain the logout element, otherwise login failed
+        if re.search(r'id="tubi-logout"', response_html):
+            return
+        raise ExtractorError(
+            'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        page = self._download_webpage(url, video_id)
+
+        login_wall = re.search(
+            r"<(?:DIV|div) class='login-required-screen'>", page)
+        if login_wall:
+            raise ExtractorError(
+                'This video requires login, use --username and --password '
+                'options to provide account credentials.', expected=True)
+
+        og_title = self._og_search_title(page)
+        og_description = self._og_search_description(page)
+        og_thumbnail = self._og_search_thumbnail(page)
+        seconds = int_or_none(self._html_search_meta(
+            'video:duration', page, 'duration'))
+
+        def decode_apu(value):
+            # The manifest URL is stored ROT13-encoded and reversed
+            rotated = codecs.decode(value, 'rot_13')
+            return rotated[::-1]
+
+        manifest_url = decode_apu(
+            self._search_regex(r"apu='([^']+)'", page, 'apu'))
+        hls_formats = self._extract_m3u8_formats(
+            manifest_url, video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'title': og_title,
+            'formats': hls_formats,
+            'thumbnail': og_thumbnail,
+            'description': og_description,
+            'duration': seconds,
+        }
index 2a1ae5a717cf7b2af16bf5a1ce3ef7494e28a7a6..3d3b635e4cb362515b365ccd8f9321e5124aadeb 100644 (file)
@@ -28,6 +28,28 @@ class TumblrIE(InfoExtractor):
             'description': 'md5:dba62ac8639482759c8eb10ce474586a',
             'thumbnail': 're:http://.*\.jpg',
         }
+    }, {
+        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+        'info_dict': {
+            'id': 'Wmur',
+            'ext': 'mp4',
+            'title': 'naked smoking & stretching',
+            'upload_date': '20150506',
+            'timestamp': 1430931613,
+        },
+        'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
     }]
 
     def _real_extract(self, url):
@@ -36,12 +58,16 @@ class TumblrIE(InfoExtractor):
         blog = m_url.group('blog_name')
 
         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
-        webpage = self._download_webpage(url, video_id)
+        webpage, urlh = self._download_webpage_handle(url, video_id)
 
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
-            webpage, 'iframe url')
-        iframe = self._download_webpage(iframe_url, video_id)
+            webpage, 'iframe url', default=None)
+        if iframe_url is None:
+            return self.url_result(urlh.geturl(), 'Generic')
+
+        iframe = self._download_webpage(iframe_url, video_id,
+                                        'Downloading iframe page')
         video_url = self._search_regex(r'<source src="([^"]+)"',
                                        iframe, 'video url')
 
@@ -56,6 +82,6 @@ class TumblrIE(InfoExtractor):
             'url': video_url,
             'ext': 'mp4',
             'title': video_title,
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
         }
index 29703a8a9a6ddf0981642c28cd2f1f68cc07c7b7..7ae63a4992a74368ec8b5f6a266a298cb6776b79 100644 (file)
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
             'ext': 'mp4',
             'duration': 3715,
             'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
-            'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
         title = xpath_text(item, './title', 'title')
         duration = int_or_none(xpath_text(item, './durate', 'duration'))
         thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
-        description = self._og_search_description(webpage)
+        description = self._html_search_meta('description', webpage)
 
         formats = []
         get_quality = qualities(['3g', 'sd', 'hq'])
index 4de0aac523313eced334aab38a9a20c7bf08dfc7..fad720b681e125ac495b26ba3870dbd65340e3ce 100644 (file)
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
 
         data_content = self._download_webpage(
             'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
-        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
 
         return {
             'id': internal_id,
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644 (file)
index 0000000..fa338b9
--- /dev/null
@@ -0,0 +1,126 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    remove_end,
+)
+
+
+class TV2IE(InfoExtractor):
+    # Single video pages on tv2.no; formats and metadata are fetched from
+    # the sumo.tv2.no web API.
+    _VALID_URL = r'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tv2.no/v/916509/',
+        'md5': '9cb9e3410b18b515d71892f27856e9b1',
+        'info_dict': {
+            'id': '916509',
+            'ext': 'flv',
+            'title': 'Se Gryttens hyllest av Steven Gerrard',
+            'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+            'timestamp': 1431715610,
+            'upload_date': '20150515',
+            'duration': 156.967,
+            'view_count': int,
+            'categories': list,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Collect the formats and the asset metadata for one video.
+
+        The play JSON is requested once per protocol (HDS, HLS); every item
+        URL is expanded according to its extension. The returned info dict
+        carries the stream URLs in 'formats' only.
+        """
+        video_id = self._match_id(url)
+
+        formats = []
+        # URLs already processed, so an item listed under both protocols is
+        # only handled once
+        format_urls = []
+        for protocol in ('HDS', 'HLS'):
+            data = self._download_json(
+                'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
+                video_id, 'Downloading play JSON')['playback']
+            for item in data['items']['item']:
+                video_url = item.get('url')
+                if not video_url or video_url in format_urls:
+                    continue
+                format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+                if not self._is_valid_url(video_url, video_id, format_id):
+                    continue
+                format_urls.append(video_url)
+                ext = determine_ext(video_url)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        video_url, video_id, f4m_id=format_id))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=format_id))
+                elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+                    # .ism manifests are deliberately ignored
+                    pass
+                else:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': format_id,
+                        'tbr': int_or_none(item.get('bitrate')),
+                        'filesize': int_or_none(item.get('fileSize')),
+                    })
+        self._sort_formats(formats)
+
+        asset = self._download_json(
+            'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
+            video_id, 'Downloading metadata JSON')['asset']
+
+        title = asset['title']
+        description = asset.get('description')
+        timestamp = parse_iso8601(asset.get('createTime'))
+        duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
+        view_count = int_or_none(asset.get('views'))
+        # `or ''` also covers a key that is present but null
+        categories = (asset.get('keywords') or '').split(',')
+
+        thumbnails = [{
+            'id': thumbnail.get('@type'),
+            'url': thumbnail.get('url'),
+        } for thumbnail in (asset.get('imageVersions') or {}).values()]
+
+        # No top-level 'url' here: the loop variable video_url is unbound
+        # when no item was listed, and 'formats' already holds the URLs.
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'categories': categories,
+            'formats': formats,
+        }
+
+
+class TV2ArticleIE(InfoExtractor):
+    # tv2.no article pages (/a/<id> or dated section URLs); every video
+    # referenced on the page becomes one playlist entry handled by TV2IE.
+    _VALID_URL = r'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+        'info_dict': {
+            'id': '6930542',
+            'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
+            'description': 'md5:339573779d3eea3542ffe12006190954',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.tv2.no/a/6930542',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per data-assetid found on the page."""
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
+            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+
+        # The site name suffix is dropped from the Open Graph title/description
+        title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+        description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644 (file)
index 0000000..3a4f393
--- /dev/null
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+    # tvc.ru player iframes (/video/iframe/id/<id>); formats and metadata
+    # are read from the /video/json/id/<id> endpoint.
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+        'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        """Return the src of the first tvc.ru player iframe in webpage, or None."""
+        # The URL body stops at whichever quote character opened the
+        # attribute, so single-quoted src values do not over-match.
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/(?:(?!\1).)+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        """Download the video JSON and build the info dict with all formats."""
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+        formats = []
+        for info in video.get('path', {}).get('quality', []):
+            video_url = info.get('url')
+            if not video_url:
+                continue
+            # Format id is taken from the path component after 'cdnvideo/',
+            # without any '-suffix'; None when the URL has no such component
+            format_id = self._search_regex(
+                r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+                'format id', default=None)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'tbr': int_or_none(info.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'thumbnail': video.get('picture'),
+            'duration': int_or_none(video.get('duration')),
+            'formats': formats,
+        }
+
+
+class TVCArticleIE(InfoExtractor):
+    # Any other tvc.ru page; the Open Graph video URL is handed to the TVC
+    # extractor while title, description and thumbnail come from this page.
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/news/show/id/69944',
+        'info_dict': {
+            'id': '75399',
+            'ext': 'mp4',
+            'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+            'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 278,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+        'info_dict': {
+            'id': '2185',
+            'ext': 'mp4',
+            'title': 'Ещё не поздно. Эфир от 03.08.2013',
+            'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 3316,
+        },
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        page = self._download_webpage(url, page_id)
+
+        # Looked up in the same order as the keys of the result below
+        og_video_url = self._og_search_video_url(page)
+        og_title = clean_html(self._og_search_title(page))
+        og_description = clean_html(self._og_search_description(page))
+        og_thumbnail = self._og_search_thumbnail(page)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'TVC',
+            'url': og_video_url,
+            'title': og_title,
+            'description': og_description,
+            'thumbnail': og_thumbnail,
+        }
index 102362b295450f58ff085ec9be7d21921a1ac494..dc3a8334a6b335143dff417d805a26df412d8783 100644 (file)
@@ -5,7 +5,9 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     float_or_none,
+    int_or_none,
     parse_age_limit,
 )
 
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
                 'display_id': 'sokrat',
                 'ext': 'flv',
                 'title': 'Сократ',
-                'description': 'md5:a05bd01be310074d5833efc6743be95e',
+                'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
                 'duration': 6586,
-                'age_limit': 0,
+                'age_limit': 12,
             },
+            'skip': 'georestricted',
         },
         {
             'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
-            'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
             'info_dict': {
                 'id': '5142516',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
                 'description': 'md5:027f7dc872948f14c96d19b4178428a4',
                 'duration': 186.080,
                 'age_limit': 0,
             },
+            'skip': 'georestricted',
         }, {
             'url': 'https://cloud.tvigle.ru/video/5267604/',
             'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
         if not video_id:
             webpage = self._download_webpage(url, display_id)
             video_id = self._html_search_regex(
-                r'<li class="video-preview current_playing" id="(\d+)">',
+                r'class="video-preview current_playing" id="(\d+)">',
                 webpage, 'video id')
 
         video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
 
         item = video_data['playlist']['items'][0]
 
+        videos = item.get('videos')
+
+        error_message = item.get('errorMessage')
+        if not videos and error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
         title = item['title']
-        description = item['description']
-        thumbnail = item['thumbnail']
+        description = item.get('description')
+        thumbnail = item.get('thumbnail')
         duration = float_or_none(item.get('durationMilliseconds'), 1000)
         age_limit = parse_age_limit(item.get('ageRestrictions'))
 
         formats = []
         for vcodec, fmts in item['videos'].items():
-            for quality, video_url in fmts.items():
+            for format_id, video_url in fmts.items():
+                if format_id == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=vcodec))
+                    continue
+                height = self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None)
                 formats.append({
                     'url': video_url,
-                    'format_id': '%s-%s' % (vcodec, quality),
+                    'format_id': '%s-%s' % (vcodec, format_id),
                     'vcodec': vcodec,
-                    'height': int(quality[:-1]),
-                    'filesize': item['video_files_size'][vcodec][quality],
+                    'height': int_or_none(height),
+                    'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
                 })
         self._sort_formats(formats)
 
index e83e31a31640fa32e4a19a48a745d279a14d3753..79863e781fd41101c76659ab3b43a85433d25665 100644 (file)
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
            viasat4play\.no/programmer|
            tv6play\.no/programmer|
            tv3play\.dk/programmer|
+           play\.novatv\.bg/programi
         )/[^/]+/(?P<id>\d+)
         '''
     _TESTS = [
@@ -173,6 +174,22 @@ class TVPlayIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+            'info_dict': {
+                'id': '624952',
+                'ext': 'flv',
+                'title': 'Здравей, България (12.06.2015 г.) ',
+                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+                'duration': 8838,
+                'timestamp': 1434100372,
+                'upload_date': '20150612',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
     ]
 
     def _real_extract(self, url):
index 67e8bfea03476ccf78d2470a973655f2a7213730..c1ee1decc433627ffa52196d44f7563b46d309cc 100644 (file)
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.24video.net/video/view/1044982',
-            'md5': '48dd7646775690a80447a8dca6a2df76',
+            'md5': 'd041af8b5b4246ea466226a0d6693345',
             'info_dict': {
                 'id': '1044982',
                 'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
             webpage, 'upload date'))
 
         uploader = self._html_search_regex(
-            r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+            r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
             webpage, 'uploader', fatal=False)
 
         view_count = int_or_none(self._html_search_regex(
index 94bd6345da18815a50b72502a8b91ae4e30ae2b5..73ce335b7f0a5b5790f8dd65e3ac170e9791b7d8 100644 (file)
@@ -22,8 +22,8 @@ class TwitchBaseIE(InfoExtractor):
 
     _API_BASE = 'https://api.twitch.tv'
     _USHER_BASE = 'http://usher.twitch.tv'
-    _LOGIN_URL = 'https://secure.twitch.tv/user/login'
-    _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login'
+    _LOGIN_URL = 'https://secure.twitch.tv/login'
+    _LOGIN_POST_URL = 'https://passport.twitch.tv/authorize'
     _NETRC_MACHINE = 'twitch'
 
     def _handle_error(self, response):
@@ -59,20 +59,12 @@ class TwitchBaseIE(InfoExtractor):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
 
-        authenticity_token = self._search_regex(
-            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
-            login_page, 'authenticity token')
-
-        login_form = {
-            'utf8': '✓'.encode('utf-8'),
-            'authenticity_token': authenticity_token,
-            'redirect_on_login': '',
-            'embed_form': 'false',
-            'mp_source_action': 'login-button',
-            'follow': '',
-            'login': username,
-            'password': password,
-        }
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'login': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
+        })
 
         request = compat_urllib_request.Request(
             self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
@@ -80,11 +72,15 @@ class TwitchBaseIE(InfoExtractor):
         response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
-        m = re.search(
-            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
-        if m:
+        error_message = self._search_regex(
+            r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
+            response, 'error message', default=None)
+        if error_message:
             raise ExtractorError(
-                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+                'Unable to login. Twitch said: %s' % error_message, expected=True)
+
+        if '>Reset your password<' in response:
+            self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
 
     def _prefer_source(self, formats):
         try:
@@ -189,17 +185,17 @@ class TwitchVodIE(TwitchItemBaseIE):
     _ITEM_SHORTCUT = 'v'
 
     _TEST = {
-        'url': 'http://www.twitch.tv/ksptv/v/3622000',
+        'url': 'http://www.twitch.tv/riotgames/v/6528877',
         'info_dict': {
-            'id': 'v3622000',
+            'id': 'v6528877',
             'ext': 'mp4',
-            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+            'title': 'LCK Summer Split - Week 6 Day 1',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'duration': 6951,
-            'timestamp': 1419028564,
-            'upload_date': '20141219',
-            'uploader': 'KSPTV',
-            'uploader_id': 'ksptv',
+            'duration': 17208,
+            'timestamp': 1435131709,
+            'upload_date': '20150624',
+            'uploader': 'Riot Games',
+            'uploader_id': 'riotgames',
             'view_count': int,
         },
         'params': {
@@ -215,7 +211,7 @@ class TwitchVodIE(TwitchItemBaseIE):
             '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
             'Downloading %s access token' % self._ITEM_TYPE)
         formats = self._extract_m3u8_formats(
-            '%s/vod/%s?nauth=%s&nauthsig=%s'
+            '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
             % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
             item_id, 'mp4')
         self._prefer_source(formats)
@@ -314,9 +310,9 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE):
 
 class TwitchStreamIE(TwitchBaseIE):
     IE_NAME = 'twitch:stream'
-    _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.twitch.tv/shroomztv',
         'info_dict': {
             'id': '12772022048',
@@ -335,7 +331,10 @@ class TwitchStreamIE(TwitchBaseIE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.twitch.tv/miracle_doto#profile-0',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -350,6 +349,12 @@ class TwitchStreamIE(TwitchBaseIE):
                 'http://www.twitch.tv/%s/profile' % channel_id,
                 'TwitchProfile', channel_id)
 
+        # Channel name may be typed in a different case than the original channel name
+        # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON), which leads to constructing
+        # an invalid m3u8 URL. Work around this by using the original channel name from
+        # the stream JSON, falling back to lowercase if it is not available.
+        channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
+
         access_token = self._download_json(
             '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
             'Downloading channel access token')
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
new file mode 100644 (file)
index 0000000..1aaa063
--- /dev/null
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+    float_or_none,
+    unescapeHTML,
+)
+
+
+class TwitterCardIE(InfoExtractor):
+    """Extractor for Twitter player cards (``twitter.com/i/cards/tfw/v1/<id>``)."""
+
+    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+        'md5': 'a74f50b310c83170319ba16de6955192',
+        'info_dict': {
+            'id': '560070183650213889',
+            'ext': 'mp4',
+            'title': 'TwitterCard',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 30.033,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Fetch the card once per User-Agent and collect one format from each.
+
+        Every response carries a ``data-player-config`` attribute holding
+        HTML-escaped JSON; the first playlist entry's ``source`` is the media
+        URL. Thumbnail and duration are read from the config of the last
+        page fetched.
+        """
+        video_id = self._match_id(url)
+
+        # Different formats served for different User-Agents
+        user_agents = (
+            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
+            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
+        )
+
+        player_config = None
+        formats = []
+        for agent in user_agents:
+            req = compat_urllib_request.Request(url)
+            req.add_header('User-Agent', agent)
+            page = self._download_webpage(req, video_id)
+
+            escaped_config = self._search_regex(
+                r'data-player-config="([^"]+)"', page, 'data player config')
+            player_config = self._parse_json(
+                unescapeHTML(escaped_config), video_id)
+
+            source_url = player_config['playlist'][0]['source']
+            fmt = {'url': source_url}
+
+            # Dimensions are only known when the URL has a /<width>x<height>/ part
+            size = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', source_url)
+            if size is not None:
+                fmt['width'] = int(size.group('width'))
+                fmt['height'] = int(size.group('height'))
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': 'TwitterCard',
+            'thumbnail': player_config.get('posterImageUrl'),
+            'duration': float_or_none(player_config.get('duration')),
+            'formats': formats,
+        }
index 4667ed83b71f4aec5f081741834e2c9cca010e82..e2bab52fef3451596ec1cf0de19e3131e378b5dd 100644 (file)
@@ -15,7 +15,8 @@ from ..utils import (
 class UdemyIE(InfoExtractor):
     IE_NAME = 'udemy'
     _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
-    _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+    _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
+    _ORIGIN_URL = 'https://www.udemy.com'
     _NETRC_MACHINE = 'udemy'
 
     _TESTS = [{
@@ -74,29 +75,33 @@ class UdemyIE(InfoExtractor):
                 expected=True)
 
         login_popup = self._download_webpage(
-            'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
-            'Downloading login popup')
+            self._LOGIN_URL, None, 'Downloading login popup')
 
         if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
             return
 
-        csrf = self._html_search_regex(
-            r'<input type="hidden" name="csrf" value="(.+?)"',
-            login_popup, 'csrf token')
+        login_form = self._form_hidden_inputs('login-form', login_popup)
+
+        login_form.update({
+            'email': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
+        })
 
-        login_form = {
-            'email': username,
-            'password': password,
-            'csrf': csrf,
-            'displayType': 'json',
-            'isSubmitted': '1',
-        }
         request = compat_urllib_request.Request(
             self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        response = self._download_json(
+        request.add_header('Referer', self._ORIGIN_URL)
+        request.add_header('Origin', self._ORIGIN_URL)
+
+        response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
-        if 'returnUrl' not in response:
+        if all(logout_pattern not in response
+               for logout_pattern in ['href="https://www.udemy.com/user/logout/', '>Logout<']):
+            error = self._html_search_regex(
+                r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
             raise ExtractorError('Unable to log in')
 
     def _real_extract(self, url):
index bba25bb58041ddca902749d32c72ca3ad3d619a1..2151f83382d6b3185722b54de2d0eab2a988c6ae 100644 (file)
@@ -3,12 +3,16 @@ from __future__ import unicode_literals
 
 import json
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    js_to_json,
+    ExtractorError,
+)
 from ..compat import compat_urlparse
 
 
 class UDNEmbedIE(InfoExtractor):
-    _VALID_URL = r'(?:https?:)?//video\.udn\.com/embed/news/(?P<id>\d+)'
+    IE_DESC = '聯合影音'
+    _VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://video.udn.com/embed/news/300040',
         'md5': 'de06b4c90b042c128395a88f0384817e',
@@ -19,7 +23,11 @@ class UDNEmbedIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }, {
-        'url': '//video.udn.com/embed/news/300040',
+        'url': 'https://video.udn.com/embed/news/300040',
+        'only_matching': True,
+    }, {
+        # From https://video.udn.com/news/303776
+        'url': 'https://video.udn.com/play/news/303776',
         'only_matching': True,
     }]
 
@@ -47,7 +55,10 @@ class UDNEmbedIE(InfoExtractor):
                 'retrieve url for %s video' % video_type),
             'format_id': video_type,
             'preference': 0 if video_type == 'mp4' else -1,
-        } for video_type, api_url in video_urls.items()]
+        } for video_type, api_url in video_urls.items() if api_url]
+
+        if not formats:
+            raise ExtractorError('No videos found', expected=True)
 
         self._sort_formats(formats)
 
index 96c809eaf7155290210e0f8b18d3a2c7c948ba97..c4751050ec60901c2750b2f1692059f6246e23dc 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     ExtractorError,
     qualities,
@@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        deliver_url = self._search_regex(
-            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
-            webpage, 'deliver URL')
+        deliver_url = self._proto_relative_url(self._search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+            webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
 
         deliver_page = self._download_webpage(
             deliver_url, video_id, 'Downloading iframe page')
@@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):
 
         player = self._parse_json(
             self._search_regex(
-                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
+                deliver_page, 'player'),
             video_id)
 
         quality = qualities(['flash', 'html5'])
index 68d03b99905cce848eb38fde8b6d8e643c548105..c39c278ab211c45809e594f64cc90f71304e9d92 100644 (file)
@@ -1,17 +1,19 @@
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
 from ..compat import (
     compat_urlparse,
 )
+from ..utils import ExtractorError
 
 
 class UstreamIE(InfoExtractor):
     _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'
     IE_NAME = 'ustream'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.ustream.tv/recorded/20274954',
         'md5': '088f151799e8f572f84eb62f17d73e5c',
         'info_dict': {
@@ -20,7 +22,18 @@ class UstreamIE(InfoExtractor):
             'uploader': 'Young Americans for Liberty',
             'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
         },
-    }
+    }, {
+        # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444
+        # Title and uploader available only from params JSON
+        'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct',
+        'md5': '5a2abf40babeac9812ed20ae12d34e10',
+        'info_dict': {
+            'id': '59307601',
+            'ext': 'flv',
+            'title': '-CG11- Canada Games Figure Skating',
+            'uploader': 'sportscanadatv',
+        }
+    }]
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
@@ -39,16 +52,42 @@ class UstreamIE(InfoExtractor):
             desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id
             return self.url_result(desktop_url, 'Ustream')
 
-        video_url = 'http://tcdn.ustream.tv/video/%s' % video_id
+        params = self._download_json(
+            'http://cdngw.ustream.tv/rgwjson/Viewer.getVideo/' + json.dumps({
+                'brandId': 1,
+                'videoId': int(video_id),
+                'autoplay': False,
+            }), video_id)
+
+        if 'error' in params:
+            raise ExtractorError(params['error']['message'], expected=True)
+
+        video_url = params['flv']
+
         webpage = self._download_webpage(url, video_id)
 
         self.report_extraction(video_id)
 
         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
-                                              webpage, 'title')
+                                              webpage, 'title', default=None)
+
+        if not video_title:
+            try:
+                video_title = params['moduleConfig']['meta']['title']
+            except KeyError:
+                pass
+
+        if not video_title:
+            video_title = 'Ustream video ' + video_id
 
         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
-                                           webpage, 'uploader', fatal=False, flags=re.DOTALL)
+                                           webpage, 'uploader', fatal=False, flags=re.DOTALL, default=None)
+
+        if not uploader:
+            try:
+                uploader = params['moduleConfig']['meta']['userName']
+            except KeyError:
+                uploader = None
 
         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
                                             webpage, 'thumbnail', fatal=False)
index dd026748dcbb536f9f49181b0d211bf0a9157777..722eb52368825b92c88506ff33d79bf1f2f91a32 100644 (file)
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';',
-                                          redirect_page, 'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id,
+        # need to get the page 3 times for the correct jsSecretToken cookie
+        # which is necessary for the correct title
+        def get_session_id():
+            redirect_page = self._download_webpage(url, video_id)
+            session_id_url = self._search_regex(
+                r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+                'session id url')
+            self._download_webpage(
+                compat_urlparse.urljoin(url, session_id_url), video_id,
+                'Getting session id')
+
+        get_session_id()
+        get_session_id()
+
+        webpage = self._download_webpage(url, video_id,
                                          'Downloading redirect page')
 
         title = self._html_search_regex(r'<title>(.*)</title>',
index 96353f5250783be95fd4bf308190309baae70187..0d8d832cc0890a8cb99c8388b1b9fc1964396d23 100644 (file)
@@ -5,6 +5,7 @@ import json
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -17,7 +18,9 @@ from ..utils import (
 class VeeHDIE(InfoExtractor):
     _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
 
-    _TEST = {
+    # Seems VeeHD videos have multiple copies on several servers, all of
+    # which have different MD5 checksums, so omit md5 field in all tests
+    _TESTS = [{
         'url': 'http://veehd.com/video/4639434_Solar-Sinter',
         'info_dict': {
             'id': '4639434',
@@ -26,7 +29,26 @@ class VeeHDIE(InfoExtractor):
             'uploader_id': 'VideoEyes',
             'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
         },
-    }
+        'skip': 'Video deleted',
+    }, {
+        'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling',
+        'info_dict': {
+            'id': '4905758',
+            'ext': 'mp4',
+            'title': 'Elysian Fields - Channeling',
+            'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b',
+            'uploader_id': 'spotted',
+        }
+    }, {
+        'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer',
+        'info_dict': {
+            'id': '2046729',
+            'ext': 'avi',
+            'title': '2012 (2009) DivX Trailer',
+            'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b',
+            'uploader_id': 'Movie_Trailers',
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -48,13 +70,21 @@ class VeeHDIE(InfoExtractor):
         player_page = self._download_webpage(
             player_url, video_id, 'Downloading player page')
 
+        video_url = None
+
         config_json = self._search_regex(
             r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
 
         if config_json:
             config = json.loads(config_json)
-            video_url = compat_urlparse.unquote(config['clip']['url'])
-        else:
+            video_url = compat_urllib_parse_unquote(config['clip']['url'])
+
+        if not video_url:
+            video_url = self._html_search_regex(
+                r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"',
+                player_page, 'video url', default=None)
+
+        if not video_url:
             iframe_src = self._search_regex(
                 r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
             iframe_url = 'http://veehd.com/%s' % iframe_src
@@ -82,7 +112,6 @@ class VeeHDIE(InfoExtractor):
             'id': video_id,
             'title': title,
             'url': video_url,
-            'ext': 'mp4',
             'uploader_id': uploader_id,
             'thumbnail': thumbnail,
             'description': description,
index 6215f06420554d424fa82b0a07fd51d13173dc87..3c8d2a9437af3021df921f5692ad5ae984b14ead 100644 (file)
@@ -38,9 +38,13 @@ class VesselIE(InfoExtractor):
         return req
 
     @staticmethod
-    def find_assets(data, asset_type):
+    def find_assets(data, asset_type, asset_id=None):
         for asset in data.get('assets', []):
-            if asset.get('type') == asset_type:
+            if not asset.get('type') == asset_type:
+                continue
+            elif asset_id is not None and not asset.get('id') == asset_id:
+                continue
+            else:
                 yield asset
 
     def _check_access_rights(self, data):
@@ -82,11 +86,13 @@ class VesselIE(InfoExtractor):
         req = VesselIE.make_json_request(
             self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
         data = self._download_json(req, video_id)
+        video_asset_id = data.get('main_video_asset')
 
         self._check_access_rights(data)
 
         try:
-            video_asset = next(VesselIE.find_assets(data, 'video'))
+            video_asset = next(
+                VesselIE.find_assets(data, 'video', asset_id=video_asset_id))
         except StopIteration:
             raise ExtractorError('No video assets found')
 
index 2f111bf7ee042de1fce790a3f0d0f13be7f1feff..f38a72fde8974a7a1ea290de04281f67079b1a16 100644 (file)
@@ -4,11 +4,26 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+)
 
 
 class VGTVIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/(?:.*)/(?P<id>[0-9]+)'
+    IE_DESC = 'VGTV and BTTV'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        vgtv:|
+                        http://(?:www\.)?
+                    )
+                    (?P<host>vgtv|bt)
+                    (?:
+                        :|
+                        \.no/(?:tv/)?\#!/(?:video|live)/
+                    )
+                    (?P<id>[0-9]+)
+                    '''
     _TESTS = [
         {
             # streamType: vod
@@ -47,16 +62,16 @@ class VGTVIE(InfoExtractor):
         },
         {
             # streamType: live
-            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
             'info_dict': {
-                'id': '100015',
+                'id': '113063',
                 'ext': 'flv',
-                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
-                'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'description': 'md5:b3743425765355855f88e096acc93231',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 0,
-                'timestamp': 1407423348,
-                'upload_date': '20140807',
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
                 'view_count': int,
             },
             'params': {
@@ -64,25 +79,47 @@ class VGTVIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+
+        HOST_WEBSITES = {
+            'vgtv': 'vgtv',
+            'bt': 'bttv',
+        }
+
         data = self._download_json(
-            'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
+            'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website'
+            % (host, video_id, HOST_WEBSITES[host]),
             video_id, 'Downloading media JSON')
 
+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
         streams = data['streamUrls']
+        stream_type = data.get('streamType')
 
         formats = []
 
         hls_url = streams.get('hls')
         if hls_url:
-            formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4'))
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', m3u8_id='hls'))
 
         hds_url = streams.get('hds')
-        if hds_url:
-            formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id))
+        # wasLive hds are always 404
+        if hds_url and stream_type != 'wasLive':
+            formats.extend(self._extract_f4m_formats(
+                hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                video_id, f4m_id='hds'))
 
         mp4_url = streams.get('mp4')
         if mp4_url:
@@ -107,11 +144,60 @@ class VGTVIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': data['title'],
+            'title': self._live_title(data['title']),
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
             'formats': formats,
+            'is_live': True if stream_type == 'live' else False,
         }
+
+
+class BTArticleIE(InfoExtractor):
+    """Extractor for bt.no article pages that embed an SVP player video.
+
+    The article page is searched for an ``SVP.Player.load(<digits>`` call;
+    the numeric id found there is passed on as ``vgtv:bt:<id>`` to the
+    'VGTV' extractor, which does the actual extraction.
+    """
+    IE_NAME = 'bt:article'
+    IE_DESC = 'Bergens Tidende Articles'
+    # Raw string: the pattern contains regex escapes (\. and \d) that must not
+    # be interpreted as (invalid) string escape sequences.
+    _VALID_URL = r'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+    _TEST = {
+        'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
+        'md5': 'd055e8ee918ef2844745fcfd1a4175fb',
+        'info_dict': {
+            'id': '23199',
+            'ext': 'mp4',
+            'title': 'Alrekstad internat',
+            'description': 'md5:dc81a9056c874fedb62fc48a300dac58',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 191,
+            'timestamp': 1289991323,
+            'upload_date': '20101117',
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a URL result pointing the 'VGTV' extractor at the embedded video."""
+        # The URL's <id> group (the article slug) is only used as the display
+        # id for the download; the real video id comes from the page itself.
+        webpage = self._download_webpage(url, self._match_id(url))
+        video_id = self._search_regex(
+            r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id')
+        return self.url_result('vgtv:bt:%s' % video_id, 'VGTV')
+
+
+class BTVestlendingenIE(InfoExtractor):
+    """Extractor for bt.no "Vestlendingen" special pages.
+
+    The numeric id from the URL fragment is passed on as
+    ``xstream:btno:<id>`` to the 'Xstream' extractor.
+    """
+    IE_NAME = 'bt:vestlendingen'
+    IE_DESC = 'Bergens Tidende - Vestlendingen'
+    # Raw string: the pattern contains regex escapes (\. and \d) that must not
+    # be interpreted as (invalid) string escape sequences.
+    _VALID_URL = r'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a URL result delegating the matched id to the 'Xstream' extractor."""
+        return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream')
index 71f520fb525a5bef424061be1ff881408b762624..01af7a99574401b38e487b01dd5104e674740bbc 100644 (file)
@@ -1,5 +1,4 @@
 from __future__ import unicode_literals
-import re
 
 from .common import InfoExtractor
 from .ooyala import OoyalaIE
@@ -7,31 +6,34 @@ from ..utils import ExtractorError
 
 
 class ViceIE(InfoExtractor):
-    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
 
-    _TEST = {
-        'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
-        'info_dict': {
-            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
-            'ext': 'mp4',
-            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-        },
-        'params': {
-            # Requires ffmpeg (m3u8 manifest)
-            'skip_download': True,
-        },
-    }
+    _TESTS = [
+        {
+            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+            'info_dict': {
+                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+                'ext': 'mp4',
+                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+            },
+            'params': {
+                # Requires ffmpeg (m3u8 manifest)
+                'skip_download': True,
+            },
+        }, {
+            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+            'only_matching': True,
+        }
+    ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        name = mobj.group('name')
-        webpage = self._download_webpage(url, name)
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
         try:
             embed_code = self._search_regex(
                 r'embedCode=([^&\'"]+)', webpage,
                 'ooyala embed code')
             ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
-            print(ooyala_url)
         except ExtractorError:
             raise ExtractorError('The page doesn\'t contain a video', expected=True)
         return self.url_result(ooyala_url, ie='Ooyala')
index eb309a7cdf99b3ebc4bde755fe09d47505516f28..78ff6310a07f6864abda658ca1e804e26594460b 100644 (file)
@@ -8,20 +8,23 @@ from ..compat import compat_urllib_request
 
 
 class VideoMegaIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
-        (?:www\.)?videomega\.tv/
-        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
-        '''
-    _TEST = {
-        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
-        'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+    _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
+        'md5': 'cc1920a58add3f05c6a93285b84fb3aa',
         'info_dict': {
-            'id': '4GNA688SU99US886ANG4',
+            'id': 'AOSQBJYKIDDIKYJBQSOA',
             'ext': 'mp4',
-            'title': 'BigBuckBunny_320x180',
+            'title': '1254207',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
-    }
+    }, {
+        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -29,12 +32,13 @@ class VideoMegaIE(InfoExtractor):
         iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
         req = compat_urllib_request.Request(iframe_url)
         req.add_header('Referer', url)
+        req.add_header('Cookie', 'noadvtday=0')
         webpage = self._download_webpage(req, video_id)
 
         title = self._html_search_regex(
-            r'<title>(.*?)</title>', webpage, 'title')
+            r'<title>(.+?)</title>', webpage, 'title')
         title = re.sub(
-            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
+            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
         thumbnail = self._search_regex(
             r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
         video_url = self._search_regex(
index ececc7ee0118932716ca9bdb06779cf94e6dc0ec..591024eaded0cdddbb0779bd942c0fa8f63d86a6 100644 (file)
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
 
         formats = [
             {
-                'url': base64.b64decode(res['u']).decode('utf-8'),
+                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
                 'ext': 'flv',
                 'format_id': res['l'],
             } for res in settings['res'] if res['u']
index bd953fb4cc212f50dce2cac624c9391a14e82898..e0b55078b2c9af8bf654ee6cd6982305074cb39b 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 class VidmeIE(InfoExtractor):
     _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://vid.me/QNB',
         'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
         'info_dict': {
@@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):
             'upload_date': '20140725',
             'thumbnail': 're:^https?://.*\.jpg',
         },
-    }
+    }, {
+        # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+        'url': 'https://vid.me/e/Wmur',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
+        url = url.replace('vid.me/e/', 'vid.me/')
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
index 619039e516c96209c38953e6b73f93a6895df54c..15377097e658b20e75a08f19b370be3bef2158c7 100644 (file)
@@ -38,11 +38,14 @@ class VierIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
 
         video_id = self._search_regex(
-            r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+            webpage, 'video id')
         application = self._search_regex(
-            r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+            webpage, 'application', default='vier_vod')
         filename = self._search_regex(
-            r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+            webpage, 'filename')
 
         playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
         formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
index 1742e66f481c91477ae41e5c42a262e35b7adc47..6ef36290b417a846bb2a6f36cc80fd2a6e59e105 100644 (file)
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_request
+from ..compat import (
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
 
 
 class ViewsterIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P<id>\d+-\d+-\d+)'
+    _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
     _TESTS = [{
-        # movielink, paymethod=fre
-        'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/',
-        'playlist': [{
-            'md5': '8f9d94b282d80c42b378dffdbb11caf3',
-            'info_dict': {
-                'id': '1293-19341-000-movie',
-                'ext': 'flv',
-                'title': "'Hout' (Wood) - Movie",
-            },
-        }],
-        'info_dict': {
-            'id': '1293-19341-000',
-            'title': "'Hout' (Wood)",
-            'description': 'md5:925733185a9242ef96f436937683f33b',
-        }
-    }, {
-        # movielink, paymethod=adv
+        # movie, Type=Movie
         'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/',
-        'playlist': [{
-            'md5': '77a005453ca7396cbe3d35c9bea30aef',
-            'info_dict': {
-                'id': '1140-11855-000-movie',
-                'ext': 'flv',
-                'title': "THE LISTENING PROJECT - Movie",
-            },
-        }],
+        'md5': '14d3cfffe66d57b41ae2d9c873416f01',
         'info_dict': {
             'id': '1140-11855-000',
-            'title': "THE LISTENING PROJECT",
-            'description': 'md5:714421ae9957e112e672551094bf3b08',
-        }
+            'ext': 'flv',
+            'title': 'The listening Project',
+            'description': 'md5:bac720244afd1a8ea279864e67baa071',
+            'timestamp': 1214870400,
+            'upload_date': '20080701',
+            'duration': 4680,
+        },
     }, {
-        # direct links, no movielink
-        'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/',
-        'playlist': [{
-            'md5': '0307b7eac6bfb21ab0577a71f6eebd8f',
-            'info_dict': {
-                'id': '1198-56411-000-trailer',
-                'ext': 'mp4',
-                'title': "Sinister - Trailer",
-            },
-        }, {
-            'md5': '80b9ee3ad69fb368f104cb5d9732ae95',
-            'info_dict': {
-                'id': '1198-56411-000-behind-scenes',
-                'ext': 'mp4',
-                'title': "Sinister - Behind Scenes",
-            },
-        }, {
-            'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5',
-            'info_dict': {
-                'id': '1198-56411-000-scene-from-movie',
-                'ext': 'mp4',
-                'title': "Sinister - Scene from movie",
-            },
-        }],
+        # series episode, Type=Episode
+        'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/',
+        'md5': 'd5434c80fcfdb61651cc2199a88d6ba3',
         'info_dict': {
-            'id': '1198-56411-000',
-            'title': "Sinister",
-            'description': 'md5:014c40b0488848de9683566a42e33372',
-        }
+            'id': '1284-19427-001',
+            'ext': 'flv',
+            'title': 'The World and a Wall',
+            'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',
+            'timestamp': 1428192000,
+            'upload_date': '20150405',
+            'duration': 1500,
+        },
+    }, {
+        # serie, Type=Serie
+        'url': 'http://www.viewster.com/serie/1303-19426-000/',
+        'info_dict': {
+            'id': '1303-19426-000',
+            'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?',
+            'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11',
+        },
+        'playlist_count': 13,
+    }, {
+        # unfinished serie, no Type
+        'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/',
+        'info_dict': {
+            'id': '1284-19427-000',
+            'title': 'Baby Steps—Season 2',
+            'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',
+        },
+        'playlist_mincount': 16,
     }]
 
     _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
+    _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA=='
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        request = compat_urllib_request.Request(
-            'http://api.live.viewster.com/api/v1/movie/%s' % video_id)
+    def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+        request = compat_urllib_request.Request(url)
         request.add_header('Accept', self._ACCEPT_HEADER)
+        request.add_header('Auth-token', self._AUTH_TOKEN)
+        return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
 
-        movie = self._download_json(
-            request, video_id, 'Downloading movie metadata JSON')
-
-        title = movie.get('title') or movie['original_title']
-        description = movie.get('synopsis')
-        thumbnail = movie.get('large_artwork') or movie.get('artwork')
-
-        entries = []
-        for clip in movie['play_list']:
-            entry = None
-
-            # movielink api
-            link_request = clip.get('link_request')
-            if link_request:
-                request = compat_urllib_request.Request(
-                    'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s&currency=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s'
-                    % link_request)
-                request.add_header('Accept', self._ACCEPT_HEADER)
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
 
-                movie_link = self._download_json(
-                    request, video_id, 'Downloading movie link JSON', fatal=False)
+        info = self._download_json(
+            'https://public-api.viewster.com/search/%s' % video_id,
+            video_id, 'Downloading entry JSON')
 
-                if movie_link:
-                    formats = self._extract_f4m_formats(
-                        movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id)
-                    self._sort_formats(formats)
-                    entry = {
-                        'formats': formats,
-                    }
+        entry_id = info.get('Id') or info['id']
 
-            # direct link
-            clip_url = clip.get('clip_data', {}).get('url')
-            if clip_url:
-                entry = {
-                    'url': clip_url,
-                    'ext': 'mp4',
-                }
+        # unfinished serie has no Type
+        if info.get('Type') in ['Serie', None]:
+            episodes = self._download_json(
+                'https://public-api.viewster.com/series/%s/episodes' % entry_id,
+                video_id, 'Downloading series JSON')
+            entries = [
+                self.url_result(
+                    'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster')
+                for episode in episodes]
+            title = (info.get('Title') or info['Synopsis']['Title']).strip()
+            description = info.get('Synopsis', {}).get('Detailed')
+            return self.playlist_result(entries, video_id, title, description)
 
-            if entry:
-                entry.update({
-                    'id': '%s-%s' % (video_id, clip['canonical_title']),
-                    'title': '%s - %s' % (title, clip['title']),
+        formats = []
+        for media_type in ('application/f4m+xml', 'application/x-mpegURL'):
+            media = self._download_json(
+                'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
+                % (entry_id, compat_urllib_parse.quote(media_type)),
+                video_id, 'Downloading %s JSON' % media_type, fatal=False)
+            if not media:
+                continue
+            video_url = media.get('Uri')
+            if not video_url:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
+                video_url += '&' if '?' in video_url else '?'
+                video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+                formats.extend(self._extract_f4m_formats(
+                    video_url, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', m3u8_id='hls',
+                    fatal=False  # m3u8 sometimes fail
+                ))
+            else:
+                formats.append({
+                    'url': video_url,
                 })
-                entries.append(entry)
+        self._sort_formats(formats)
 
-        playlist = self.playlist_result(entries, video_id, title, description)
-        playlist['thumbnail'] = thumbnail
-        return playlist
+        synopsis = info.get('Synopsis', {})
+        # Prefer title outside synopsis since it's less messy
+        title = (info.get('Title') or synopsis['Title']).strip()
+        description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short')
+        duration = int_or_none(info.get('Duration'))
+        timestamp = parse_iso8601(info.get('ReleaseDate'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index 6816dacb665e2253a132cfe678999a1129860a0b..51cdc6b65143aaf4a0d2823ffa8c859c96e25972 100644 (file)
@@ -1,22 +1,98 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
-import re
+import json
+import time
+import hmac
+import hashlib
+import itertools
 
-from ..compat import compat_urlparse
 from ..utils import (
     ExtractorError,
-    unescapeHTML,
-    unified_strdate,
-    US_RATINGS,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
 )
+from ..compat import compat_urllib_request
 from .common import InfoExtractor
 
 
-class VikiIE(InfoExtractor):
-    IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+    _APP = '65535a'
+    _APP_VERSION = '2.2.5.1428709186'
+    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+    _NETRC_MACHINE = 'viki'
+
+    _token = None
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
+        path += '?' if '?' not in path else '&'
+        if not timestamp:
+            timestamp = int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
+        sig = hmac.new(
+            self._APP_SECRET.encode('ascii'),
+            query.encode('ascii'),
+            hashlib.sha1
+        ).hexdigest()
+        url = self._API_URL_TEMPLATE % (query, sig)
+        return compat_urllib_request.Request(
+            url, json.dumps(post_data).encode('utf-8')) if post_data else url
+
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+        resp = self._download_json(
+            self._prepare_call(path, timestamp, post_data), video_id, note)
+
+        error = resp.get('error')
+        if error:
+            if error == 'invalid timestamp':
+                resp = self._download_json(
+                    self._prepare_call(path, int(resp['current_timestamp']), post_data),
+                    video_id, '%s (retry)' % note)
+                error = resp.get('error')
+            if error:
+                self._raise_error(resp['error'])
+
+        return resp
+
+    def _raise_error(self, error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, error),
+            expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'login_id': username,
+            'password': password,
+        }
+
+        login = self._call_api(
+            'sessions.json', None,
+            'Logging in as %s' % username, post_data=login_form)
+
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
 
-    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
-    _TEST = {
+
+class VikiIE(VikiBaseIE):
+    IE_NAME = 'viki'
+    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
         'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
         'info_dict': {
             'id': '1023585v',
@@ -28,70 +104,219 @@ class VikiIE(InfoExtractor):
             'age_limit': 13,
         },
         'skip': 'Blocked in the US',
-    }
+    }, {
+        # clip
+        'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
+        'info_dict': {
+            'id': '1067139v',
+            'ext': 'mp4',
+            'title': "'The Avengers: Age of Ultron' Press Conference",
+            'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+            'duration': 352,
+            'timestamp': 1430380829,
+            'upload_date': '20150430',
+            'uploader': 'Arirang TV',
+            'like_count': int,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
+        'info_dict': {
+            'id': '1048879v',
+            'ext': 'mp4',
+            'title': 'Ankhon Dekhi',
+            'duration': 6512,
+            'timestamp': 1408532356,
+            'upload_date': '20140820',
+            'uploader': 'Spuul',
+            'like_count': int,
+            'age_limit': 13,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # episode
+        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+        'md5': '190f3ef426005ba3a080a63325955bc3',
+        'info_dict': {
+            'id': '44699v',
+            'ext': 'mp4',
+            'title': 'Boys Over Flowers - Episode 1',
+            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+            'duration': 4155,
+            'timestamp': 1270496524,
+            'upload_date': '20100405',
+            'uploader': 'group8',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        # youtube external
+        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+        'info_dict': {
+            'id': '50562v',
+            'ext': 'mp4',
+            'title': 'Poor Nastya [COMPLETE] - Episode 1',
+            'description': '',
+            'duration': 607,
+            'timestamp': 1274949505,
+            'upload_date': '20101213',
+            'uploader': 'ad14065n',
+            'uploader_id': 'ad14065n',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        'url': 'http://www.viki.com/player/44699v',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        uploader_m = re.search(
-            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
-        if uploader_m is None:
-            uploader = None
-        else:
-            uploader = uploader_m.group(1).strip()
-
-        rating_str = self._html_search_regex(
-            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
-            'rating information', default='').strip()
-        age_limit = US_RATINGS.get(rating_str)
-
-        info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
-        info_webpage = self._download_webpage(
-            info_url, video_id, note='Downloading info page')
-        if re.match(r'\s*<div\s+class="video-error', info_webpage):
-            raise ExtractorError(
-                'Video %s is blocked from your location.' % video_id,
-                expected=True)
-        video_url = self._html_search_regex(
-            r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
-
-        upload_date_str = self._html_search_regex(
-            r'"created_at":"([^"]+)"', info_webpage, 'upload date')
-        upload_date = (
-            unified_strdate(upload_date_str)
-            if upload_date_str is not None
-            else None
-        )
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
-        return {
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+        title = None
+        titles = video.get('titles')
+        if titles:
+            title = titles.get('en') or titles[titles.keys()[0]]
+        if not title:
+            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+            container_titles = video.get('container', {}).get('titles')
+            if container_titles:
+                container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]]
+                title = '%s - %s' % (container_title, title)
+
+        descriptions = video.get('descriptions')
+        description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = []
+        for thumbnail_id, thumbnail in video.get('images', {}).items():
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail.get('url'),
+            })
+
+        subtitles = {}
+        for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
             'id': video_id,
             'title': title,
-            'url': video_url,
             'description': description,
-            'thumbnail': thumbnail,
-            'age_limit': age_limit,
+            'duration': duration,
+            'timestamp': timestamp,
             'uploader': uploader,
-            'subtitles': video_subtitles,
-            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
         }
 
-    def _get_subtitles(self, video_id, info_webpage):
-        res = {}
-        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
-            sturl = unescapeHTML(sturl_html)
-            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
-            if not m:
-                continue
-            res[m.group('lang')] = [{
-                'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
-                'ext': 'vtt',
-            }]
-        return res
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        if 'external' in streams:
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)
+            for protocol, format_dict in stream_dict.items():
+                if format_id == 'm3u8':
+                    formats = self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol)
+                else:
+                    formats.append({
+                        'url': format_dict['url'],
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_count': 70,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        titles = channel['titles']
+        title = titles.get('en') or titles[titles.keys()[0]]
+
+        descriptions = channel['descriptions']
+        description = descriptions.get('en') or descriptions[descriptions.keys()[0]]
+
+        entries = []
+        for video_type in ('episodes', 'clips', 'movies'):
+            for page_num in itertools.count(1):
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_id = video['id']
+                    entries.append(self.url_result(
+                        'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+                if not page['pagination']['next']:
+                    break
+
+        return self.playlist_result(entries, channel_id, title, description)
index 28bcc89cd7423dafa40032076d1bd3ad12f4bdcf..10d6745af703e00d6962d3e14c8b01f2419ad955 100644 (file)
@@ -22,6 +22,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     urlencode_postdata,
+    unescapeHTML,
 )
 
 
@@ -38,7 +39,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
         self.report_login()
         login_url = 'https://vimeo.com/log_in'
         webpage = self._download_webpage(login_url, None, False)
-        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')
+        token = self._search_regex(r'xsrft":"(.*?)"', webpage, 'login token')
         data = urlencode_postdata({
             'email': username,
             'password': password,
@@ -173,11 +174,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
         },
     ]
 
+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        # Look for embedded (iframe) Vimeo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group('url'))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return surl
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
             raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
-        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')
+        token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
         data = urlencode_postdata({
             'password': password,
             'token': token,
@@ -223,6 +239,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
         orig_url = url
         if mobj.group('pro') or mobj.group('player'):
             url = 'https://player.vimeo.com/video/' + video_id
+        else:
+            url = 'https://vimeo.com/' + video_id
 
         # Retrieve video webpage to extract further information
         request = compat_urllib_request.Request(url, None, headers)
@@ -434,12 +452,8 @@ class VimeoChannelIE(InfoExtractor):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
             raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', login_form))
-        token = self._search_regex(r'xsrft = \'(.*?)\'', webpage, 'login token')
+        fields = self._hidden_inputs(login_form)
+        token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
         fields['token'] = token
         fields['password'] = password
         post = urlencode_postdata(fields)
index ee3d86117e625cca66303aeeee229f1a091b4602..92321d66e369626c0adfeda6cb4fae282a6f7abb 100644 (file)
@@ -1,75 +1,60 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
-import base64
-import re
-import xml.etree.ElementTree
-import zlib
-
 from .common import InfoExtractor
 from ..utils import int_or_none
 
 
-class VimpleIE(InfoExtractor):
-    IE_DESC = 'Vimple.ru'
-    _VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'
+class SprutoBaseIE(InfoExtractor):
+    def _extract_spruto(self, spruto, video_id):
+        playlist = spruto['playlist'][0]
+        title = playlist['title']
+        video_id = playlist.get('videoId') or video_id
+        thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+        duration = int_or_none(playlist.get('duration'))
+
+        formats = [{
+            'url': f['url'],
+        } for f in playlist['video']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class VimpleIE(SprutoBaseIE):
+    IE_DESC = 'Vimple - one-click video hosting'
+    _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})'
     _TESTS = [
         {
             'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
             'md5': '2e750a330ed211d3fd41821c6ad9a279',
             'info_dict': {
-                'id': 'c0f6b1687dcd4000a97ebe70068039cf',
+                'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf',
                 'ext': 'mp4',
                 'title': 'Sunset',
                 'duration': 20,
                 'thumbnail': 're:https?://.*?\.jpg',
             },
-        },
+        }, {
+            'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        iframe_url = 'http://player.vimple.ru/iframe/%s' % video_id
+        video_id = self._match_id(url)
 
-        iframe = self._download_webpage(
-            iframe_url, video_id,
-            note='Downloading iframe', errnote='unable to fetch iframe')
-        player_url = self._html_search_regex(
-            r'"(http://player.vimple.ru/flash/.+?)"', iframe, 'player url')
+        webpage = self._download_webpage(
+            'http://player.vimple.ru/iframe/%s' % video_id, video_id)
 
-        player = self._request_webpage(
-            player_url, video_id, note='Downloading swf player').read()
+        spruto = self._parse_json(
+            self._search_regex(
+                r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
+            video_id)
 
-        player = zlib.decompress(player[8:])
-
-        xml_pieces = re.findall(b'([a-zA-Z0-9 =+/]{500})', player)
-        xml_pieces = [piece[1:-1] for piece in xml_pieces]
-
-        xml_data = b''.join(xml_pieces)
-        xml_data = base64.b64decode(xml_data)
-
-        xml_data = xml.etree.ElementTree.fromstring(xml_data)
-
-        video = xml_data.find('Video')
-        quality = video.get('quality')
-        q_tag = video.find(quality.capitalize())
-
-        formats = [
-            {
-                'url': q_tag.get('url'),
-                'tbr': int(q_tag.get('bitrate')),
-                'filesize': int(q_tag.get('filesize')),
-                'format_id': quality,
-            },
-        ]
-
-        return {
-            'id': video_id,
-            'title': video.find('Title').text,
-            'formats': formats,
-            'thumbnail': video.find('Poster').get('url'),
-            'duration': int_or_none(video.get('duration')),
-            'webpage_url': video.find('Share').get('videoPageUrl'),
-        }
+        return self._extract_spruto(spruto, video_id)
index 65c459fad39c36798d3d262d37ee95e52a2c313c..c733a48fa26edce6b219d3bf2404267b8c346bf6 100644 (file)
@@ -75,7 +75,7 @@ class VineIE(InfoExtractor):
         return {
             'id': video_id,
             'title': self._og_search_title(webpage),
-            'alt_title': self._og_search_description(webpage),
+            'alt_title': self._og_search_description(webpage, default=None),
             'description': data['description'],
             'thumbnail': data['thumbnailUrl'],
             'upload_date': unified_strdate(data['created']),
index cc384adbf9837f35f90c64d0e8dc0396b0b601ec..c30c5a8e524324a29d7c4a2dad49d5b9d50dda30 100644 (file)
@@ -13,14 +13,26 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     orderedSet,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
 )
 
 
 class VKIE(InfoExtractor):
-    IE_NAME = 'vk.com'
-    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
+    IE_NAME = 'vk'
+    IE_DESC = 'VK'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+                            (?:
+                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:www\.)?biqle\.ru/watch/
+                            )
+                            (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+                        )
+                    '''
     _NETRC_MACHINE = 'vk'
 
     _TESTS = [
@@ -34,6 +46,7 @@ class VKIE(InfoExtractor):
                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
                 'duration': 195,
                 'upload_date': '20120212',
+                'view_count': int,
             },
         },
         {
@@ -45,7 +58,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Tom Cruise',
                 'title': 'No name',
                 'duration': 9,
-                'upload_date': '20130721'
+                'upload_date': '20130721',
+                'view_count': int,
             }
         },
         {
@@ -59,6 +73,7 @@ class VKIE(InfoExtractor):
                 'title': 'Lin Dan',
                 'duration': 101,
                 'upload_date': '20120730',
+                'view_count': int,
             }
         },
         {
@@ -73,7 +88,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Триллеры',
                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
                 'duration': 8352,
-                'upload_date': '20121218'
+                'upload_date': '20121218',
+                'view_count': int,
             },
             'skip': 'Requires vk account credentials',
         },
@@ -100,14 +116,54 @@ class VKIE(InfoExtractor):
                 'title': 'Книга Илая',
                 'duration': 6771,
                 'upload_date': '20140626',
+                'view_count': int,
             },
             'skip': 'Only works from Russia',
         },
+        {
+            # video (removed?) only available with list id
+            'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
+            'md5': '091287af5402239a1051c37ec7b92913',
+            'info_dict': {
+                'id': '171201961',
+                'ext': 'mp4',
+                'title': 'ТюменцевВВ_09.07.2015',
+                'uploader': 'Anton Ivanov',
+                'duration': 109,
+                'upload_date': '20150709',
+                'view_count': int,
+            },
+        },
+        {
+            # youtube embed
+            'url': 'https://vk.com/video276849682_170681728',
+            'info_dict': {
+                'id': 'V3K4mi0SYkc',
+                'ext': 'mp4',
+                'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
+                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+                'duration': 179,
+                'upload_date': '20130116',
+                'uploader': "Children's Joy Foundation",
+                'uploader_id': 'thecjf',
+                'view_count': int,
+            },
+        },
         {
             # removed video, just testing that we match the pattern
             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
             'only_matching': True,
         },
+        {
+            # age restricted video, requires vk account credentials
+            'url': 'https://vk.com/video205387401_164765225',
+            'only_matching': True,
+        },
+        {
+            # vk wrapper
+            'url': 'http://www.biqle.ru/watch/847655_160197695',
+            'only_matching': True,
+        }
     ]
 
     def _login(self):
@@ -115,20 +171,25 @@ class VKIE(InfoExtractor):
         if username is None:
             return
 
-        login_form = {
-            'act': 'login',
-            'role': 'al_frame',
-            'expire': '1',
-            'email': username,
-            'pass': password,
-        }
+        login_page = self._download_webpage(
+            'https://vk.com', None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
+        })
 
-        request = compat_urllib_request.Request('https://login.vk.com/?act=login',
-                                                compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+        request = compat_urllib_request.Request(
+            'https://login.vk.com/?act=login',
+            compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        login_page = self._download_webpage(
+            request, None, note='Logging in as %s' % username)
 
         if re.search(r'onLoginFailed', login_page):
-            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+            raise ExtractorError(
+                'Unable to login, incorrect username and/or password', expected=True)
 
     def _real_initialize(self):
         self._login()
@@ -140,9 +201,26 @@ class VKIE(InfoExtractor):
         if not video_id:
             video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
 
-        info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+        info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+
+        # Some videos (removed?) can only be downloaded with list id specified
+        list_id = mobj.group('list_id')
+        if list_id:
+            info_url += '&list=%s' % list_id
+
         info_page = self._download_webpage(info_url, video_id)
 
+        error_message = self._html_search_regex(
+            r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+            info_page, 'error message', default=None)
+        if error_message:
+            raise ExtractorError(error_message, expected=True)
+
+        if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+            raise ExtractorError(
+                'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+                expected=True)
+
         ERRORS = {
             r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
             'Video %s has been removed from public access due to rightholder complaint.',
@@ -156,16 +234,20 @@ class VKIE(InfoExtractor):
 
             r'<!>Видео временно недоступно':
             'Video %s is temporarily unavailable.',
+
+            r'<!>Access denied':
+            'Access denied to video %s.',
         }
 
         for error_re, error_msg in ERRORS.items():
             if re.search(error_re, info_page):
                 raise ExtractorError(error_msg % video_id, expected=True)
 
-        m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
-        if m_yt is not None:
-            self.to_screen('Youtube video detected')
-            return self.url_result(m_yt.group(1), 'Youtube')
+        youtube_url = self._search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+            info_page, 'youtube iframe', default=None)
+        if youtube_url:
+            return self.url_result(youtube_url, 'Youtube')
 
         m_rutube = re.search(
             r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
@@ -175,25 +257,29 @@ class VKIE(InfoExtractor):
                 m_rutube.group(1).replace('\\', ''))
             return self.url_result(rutube_url)
 
-        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
         if m_opts:
-            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
             if m_opts_url:
                 opts_url = m_opts_url.group(1)
                 if opts_url.startswith('//'):
                     opts_url = 'http:' + opts_url
                 return self.url_result(opts_url)
 
-        data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+        data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
         data = json.loads(data_json)
 
         # Extract upload date
         upload_date = None
-        mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
         if mobj is not None:
             mobj.group(1) + ' ' + mobj.group(2)
             upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
 
+        view_count = str_to_int(self._search_regex(
+            r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+            info_page, 'view count', fatal=False))
+
         formats = [{
             'format_id': k,
             'url': v,
@@ -210,29 +296,39 @@ class VKIE(InfoExtractor):
             'uploader': data.get('md_author'),
             'duration': data.get('duration'),
             'upload_date': upload_date,
+            'view_count': view_count,
         }
 
 
 class VKUserVideosIE(InfoExtractor):
-    IE_NAME = 'vk.com:user-videos'
-    IE_DESC = 'vk.com:All of a user\'s videos'
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?'
+    IE_NAME = 'vk:uservideos'
+    IE_DESC = "VK - User's Videos"
+    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$'
     _TEMPLATE_URL = 'https://vk.com/videos'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://vk.com/videos205387401',
         'info_dict': {
             'id': '205387401',
+            'title': "Tom Cruise's Videos",
         },
         'playlist_mincount': 4,
-    }
+    }, {
+        'url': 'http://vk.com/videos-77521',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
-        page = self._download_webpage(url, page_id)
-        video_ids = orderedSet(
-            m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page))
-        url_entries = [
+
+        webpage = self._download_webpage(url, page_id)
+
+        entries = [
             self.url_result(
                 'http://vk.com/video' + video_id, 'VK', video_id=video_id)
-            for video_id in video_ids]
-        return self.playlist_result(url_entries, page_id)
+            for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
+
+        title = unescapeHTML(self._search_regex(
+            r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
+            webpage, 'title', default=page_id))
+
+        return self.playlist_result(entries, page_id, title)
index 1c0966a793511a2ec3a9d147bd75ff22e8fb7209..ccf1928b5d323f277b4e8a47bd4d008e821b147c 100644 (file)
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
@@ -28,12 +26,7 @@ class VodlockerIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         if fields['op'] == 'download1':
             self._sleep(3, video_id)  # they do detect when requests happen too fast!
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
new file mode 100644 (file)
index 0000000..254383d
--- /dev/null
@@ -0,0 +1,99 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+)
+
+
+class VoiceRepublicIE(InfoExtractor):
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+        'md5': '0554a24d1657915aa8e8f84e15dc9353',
+        'info_dict': {
+            'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
+            'ext': 'm4a',
+            'title': 'Watching the Watchers: Building a Sousveillance State',
+            'description': 'md5:715ba964958afa2398df615809cfecb1',
+            'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+            'duration': 1800,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        req = compat_urllib_request.Request(
+            compat_urlparse.urljoin(url, '/talks/%s' % display_id))
+        # Older versions of Firefox get redirected to an "upgrade browser" page
+        req.add_header('User-Agent', 'youtube-dl')
+        webpage = self._download_webpage(req, display_id)
+
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
+
+        config = self._search_regex(
+            r'(?s)return ({.+?});\s*\n', webpage,
+            'data', default=None)
+        data = self._parse_json(config, display_id, fatal=False) if config else None
+        if data:
+            title = data['title']
+            description = data.get('teaser')
+            talk_id = data.get('talk_id') or display_id
+            talk = data['talk']
+            duration = int_or_none(talk.get('duration'))
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in talk['links'].items()]
+        else:
+            title = self._og_search_title(webpage)
+            description = self._html_search_regex(
+                r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
+                webpage, 'description', fatal=False)
+            talk_id = self._search_regex(
+                [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
+                webpage, 'talk id', default=None) or display_id
+            duration = None
+            player = self._search_regex(
+                r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player')
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)]
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        view_count = int_or_none(self._search_regex(
+            r"class='play-count[^']*'>\s*(\d+) plays",
+            webpage, 'play count', fatal=False))
+
+        return {
+            'id': talk_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
index 2d23effccdff0ba49ff628ded1f72d044fe609d6..92c90e5172e89b98c3309bb01dc9787f15c24859 100644 (file)
@@ -27,9 +27,6 @@ class VpornIE(InfoExtractor):
                 'duration': 393,
                 'age_limit': 18,
                 'view_count': int,
-                'like_count': int,
-                'dislike_count': int,
-                'comment_count': int,
             }
         },
         {
@@ -47,9 +44,6 @@ class VpornIE(InfoExtractor):
                 'duration': 588,
                 'age_limit': 18,
                 'view_count': int,
-                'like_count': int,
-                'dislike_count': int,
-                'comment_count': int,
             }
         },
     ]
@@ -64,29 +58,29 @@ class VpornIE(InfoExtractor):
         title = self._html_search_regex(
             r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
         description = self._html_search_regex(
-            r'<div class="description_txt">(.*?)</div>', webpage, 'description', fatal=False)
+            r'class="(?:descr|description_txt)">(.*?)</div>',
+            webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
             r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None)
         if thumbnail:
             thumbnail = 'http://www.vporn.com' + thumbnail
 
         uploader = self._html_search_regex(
-            r'(?s)UPLOADED BY.*?<a href="/user/[^"]+">([^<]+)</a>',
+            r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>',
             webpage, 'uploader', fatal=False)
 
-        categories = re.findall(r'<a href="/cat/[^"]+">([^<]+)</a>', webpage)
+        categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage)
 
         duration = parse_duration(self._search_regex(
-            r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False))
+            r'Runtime:\s*</span>\s*(\d+ min \d+ sec)',
+            webpage, 'duration', fatal=False))
 
-        view_count = str_to_int(self._html_search_regex(
-            r'<span>([\d,\.]+) VIEWS</span>', webpage, 'view count', fatal=False))
-        like_count = str_to_int(self._html_search_regex(
-            r'<span id="like" class="n">([\d,\.]+)</span>', webpage, 'like count', fatal=False))
-        dislike_count = str_to_int(self._html_search_regex(
-            r'<span id="dislike" class="n">([\d,\.]+)</span>', webpage, 'dislike count', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'class="views">([\d,\.]+) [Vv]iews<',
+            webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'<h4>Comments \(<b>([\d,\.]+)</b>\)</h4>', webpage, 'comment count', fatal=False))
+            r"'Comments \(([\d,\.]+)\)'",
+            webpage, 'comment count', default=None))
 
         formats = []
 
@@ -117,8 +111,6 @@ class VpornIE(InfoExtractor):
             'categories': categories,
             'duration': duration,
             'view_count': view_count,
-            'like_count': like_count,
-            'dislike_count': dislike_count,
             'comment_count': comment_count,
             'age_limit': 18,
             'formats': formats,
index 405cb9db49f41a144a4c842d8f99aeb1c2023da9..149e364677fcab4d0374479c4b96ff741277b17e 100644 (file)
@@ -36,6 +36,7 @@ class VubeIE(InfoExtractor):
                 'comment_count': int,
                 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
             },
+            'skip': 'Not accessible from Travis CI server',
         }, {
             'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
             'md5': 'db7aba89d4603dadd627e9d1973946fe',
index c3fde53f5ef06a56b54e94b20b72a7e98c1992a5..a6d9b5fee1f4864d82c7f8bb83e87884c96afe3b 100644 (file)
@@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):
         links_code = self._search_regex(
             r'''(?xs)
                 (?:
-                    <img\s+src="/im/play.gif".*?>|
+                    <img\s+src="[^"]*/play.gif".*?>|
                     <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
                 )
                 (.*?)
index 1eb24a3d67ffa92838ce41301b3b47d401482609..faa167e65861af3bb4803ab96fe931c15597dc00 100644 (file)
@@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):
         query_webpage = self._download_webpage(
             query_url, display_id, note='Downloading query page')
         params_json = self._search_regex(
-            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
+            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
             query_webpage,
             'player params')
         params = json.loads(params_json)
index bf9e40bad7c29e01b231b2ecf9e0cbb53295d52f..affcc52f6e244c40bbca6381700c2f15e645580f 100644 (file)
@@ -113,7 +113,7 @@ class WatIE(InfoExtractor):
             video_url = self._download_webpage(
                 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country),
                 real_id,
-                'Downloding %s video URL' % fmt[0],
+                'Downloading %s video URL' % fmt[0],
                 'Failed to download %s video URL' % fmt[0],
                 False)
             if not video_url:
index 73077a312549f6b883fdf549a2b364f6de35db9f..2037d9b3d57cd5876d85e9552ffcc9f387fcc975 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import int_or_none
 
@@ -98,3 +100,42 @@ class WebOfStoriesIE(InfoExtractor):
             'description': description,
             'duration': duration,
         }
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.webofstories.com/playAll/donald.knuth',
+        'info_dict': {
+            'id': 'donald.knuth',
+            'title': 'Donald Knuth (Scientist)',
+        },
+        'playlist_mincount': 97,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories')
+            for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage))
+        ]
+
+        title = self._search_regex(
+            r'<div id="speakerName">\s*<span>([^<]+)</span>',
+            webpage, 'speaker', default=None)
+        if title:
+            field = self._search_regex(
+                r'<span id="primaryField">([^<]+)</span>',
+                webpage, 'field', default=None)
+            if field:
+                title += ' (%s)' % field
+
+        if not title:
+            title = self._search_regex(
+                r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
+                webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)
index d6dec25ca9e7bb9de539e89c147e22b7381e3719..f69d46a2858077ed76ec9c8fc86166668f27c705 100644 (file)
@@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):
         video_id = mobj.group(1)
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
-            r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
+            [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"],
+            webpage, 'video URL')
         if YoutubeIE.suitable(video_url):
             self.to_screen('Found YouTube video')
             return {
index d5c26a032bcf28a9c8ae79e1d083d67ed29b2726..a3ea26feb38257071c8ae5d3c1702cf0fcd2650a 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 
 class WorldStarHipHopIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+    _TESTS = [{
         "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
         "md5": "9d04de741161603bf7071bbf4e883186",
         "info_dict": {
@@ -15,7 +15,15 @@ class WorldStarHipHopIE(InfoExtractor):
             "ext": "mp4",
             "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
         }
-    }
+    }, {
+        'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
+        'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
+        'info_dict': {
+            'id': 'wshh6a7q1ny0G34ZwuIO',
+            'ext': 'mp4',
+            "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -26,19 +34,22 @@ class WorldStarHipHopIE(InfoExtractor):
             return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
 
         video_url = self._search_regex(
-            r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
+            [r'so\.addVariable\("file","(.*?)"\)',
+             r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
+            webpage, 'video URL')
 
         if 'youtube' in video_url:
             return self.url_result(video_url, ie='Youtube')
 
         video_title = self._html_search_regex(
-            r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+            [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+             r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
             webpage, 'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
         thumbnail = self._html_search_regex(
             r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
-            fatal=False)
+            default=None)
         if not thumbnail:
             _title = r'candytitles.*>(.*)</span>'
             mobj = re.search(_title, webpage)
index 80c48c37d32c0849e689d626811ee34c5b414ee0..4ff99e5ca37fb8f4f0b663cc99761c31e75f1cf4 100644 (file)
@@ -1,9 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class XBefIE(InfoExtractor):
@@ -30,7 +28,7 @@ class XBefIE(InfoExtractor):
         config_url_enc = self._download_webpage(
             'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
             note='Retrieving config URL')
-        config_url = compat_urllib_parse.unquote(config_url_enc)
+        config_url = compat_urllib_parse_unquote(config_url_enc)
         config = self._download_xml(
             config_url, video_id, note='Retrieving config')
 
index 4527567f8fc26c45b091aa868dc77b159576b56c..b4ad513a0d18ecbae558deceb6234efaa5ce5415 100644 (file)
@@ -13,7 +13,6 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
     _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
     _TESTS = [
         {
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
             'age_limit': age_limit,
             'formats': formats,
         }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'upload_date': '20140728',
+            'uploader_id': 'anonymous',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            webpage, 'xhamster url')
+
+        return self.url_result(video_url, 'XHamster')
index 8c6241aedf7249343a725ab705968d0af963294a..7c9d8af6f2585207347d58d08fc607ebf4d28900 100644 (file)
@@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):
             r'minus_track\.dur_sec=\'([0-9]*?)\'',
             webpage, 'duration', fatal=False))
         filesize_approx = parse_filesize(self._html_search_regex(
-            r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+            r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
             webpage, 'approximate filesize', fatal=False))
         tbr = int_or_none(self._html_search_regex(
             r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
@@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):
             description = re.sub(' *\r *', '\n', description)
 
         enc_token = self._html_search_regex(
-            r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+            r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
         token = ''.join(
             c if pos == 3 else compat_chr(compat_ord(c) - 1)
             for pos, c in enumerate(reversed(enc_token)))
index 79ed6c744242bf132afd033ae35949cc1e2263b5..5a41f8ffa0c5a46a3d0431a6aac8e93ba8ca1cb9 100644 (file)
@@ -2,9 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class XNXXIE(InfoExtractor):
@@ -26,7 +24,7 @@ class XNXXIE(InfoExtractor):
 
         video_url = self._search_regex(r'flv_url=(.*?)&amp;',
                                        webpage, 'video URL')
-        video_url = compat_urllib_parse.unquote(video_url)
+        video_url = compat_urllib_parse_unquote(video_url)
 
         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
                                               webpage, 'title')
diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py
new file mode 100644 (file)
index 0000000..71584c2
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    xpath_with_ns,
+    xpath_text,
+    find_xpath_attr,
+)
+
+
+class XstreamIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        xstream:|
+                        https?://frontend\.xstream\.(?:dk|net)/
+                    )
+                    (?P<partner_id>[^/]+)
+                    (?:
+                        :|
+                        /feed/video/\?.*?\bid=
+                    )
+                    (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+    }, {
+        'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id')
+        video_id = mobj.group('id')
+
+        data = self._download_xml(
+            'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'
+            % (partner_id, video_id),
+            video_id)
+
+        NS_MAP = {
+            'atom': 'http://www.w3.org/2005/Atom',
+            'xt': 'http://xstream.dk/',
+            'media': 'http://search.yahoo.com/mrss/',
+        }
+
+        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+        title = xpath_text(
+            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+        description = xpath_text(
+            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+        timestamp = parse_iso8601(xpath_text(
+            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+        formats = []
+        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+            media_url = media_content.get('url')
+            if not media_url:
+                continue
+            tbr = int_or_none(media_content.get('bitrate'))
+            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+            if mobj:
+                formats.append({
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:%s' % mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'ext': 'flv',
+                    'tbr': tbr,
+                    'format_id': 'rtmp-%d' % tbr,
+                })
+            else:
+                formats.append({
+                    'url': media_url,
+                    'tbr': tbr,
+                })
+        self._sort_formats(formats)
+
+        link = find_xpath_attr(
+            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+        if link is not None:
+            formats.append({
+                'url': link.get('href'),
+                'format_id': link.get('rel'),
+            })
+
+        thumbnails = [{
+            'url': splash.get('url'),
+            'width': int_or_none(splash.get('width')),
+            'height': int_or_none(splash.get('height')),
+        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
index 1644f53c876329f053406be3d3dc1aa463cddc1b..779e4f46a1dd5315c6a9be3dad09e65c07a205b2 100644 (file)
@@ -5,7 +5,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
 )
 from ..utils import (
     parse_duration,
@@ -59,7 +59,7 @@ class XTubeIE(InfoExtractor):
         for format_id, video_url in re.findall(
                 r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
             fmt = {
-                'url': compat_urllib_parse.unquote(video_url),
+                'url': compat_urllib_parse_unquote(video_url),
                 'format_id': format_id,
             }
             m = re.search(r'^(?P<height>\d+)[pP]', format_id)
@@ -68,7 +68,7 @@ class XTubeIE(InfoExtractor):
             formats.append(fmt)
 
         if not formats:
-            video_url = compat_urllib_parse.unquote(self._search_regex(
+            video_url = compat_urllib_parse_unquote(self._search_regex(
                 r'flashvars\.video_url\s*=\s*"([^"]+)"',
                 webpage, 'video URL'))
             formats.append({'url': video_url})
index 81d885fdcee1cf788c217e862629df58f386d73c..5aac8adb36e2ad12e798cb4f0c77e5b204c7b91b 100644 (file)
@@ -13,6 +13,7 @@ from ..utils import (
 
 
 class XuiteIE(InfoExtractor):
+    IE_DESC = '隨意窩Xuite影音'
     _REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
     _VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
     _TESTS = [{
index 2a45dc574263f7e651020e591fcc40bdf987367d..5dcf2fdd12f9140f0bd373fd5db41c93f4b18b38 100644 (file)
@@ -4,11 +4,13 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     ExtractorError,
+    determine_ext,
 )
 
 
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
         }
     }
 
+    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -33,16 +37,37 @@ class XVideosIE(InfoExtractor):
         if mobj:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
 
-        video_url = compat_urllib_parse.unquote(
+        video_url = compat_urllib_parse_unquote(
             self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
         video_title = self._html_search_regex(
             r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
 
+        formats = [{
+            'url': video_url,
+        }]
+
+        android_req = compat_urllib_request.Request(url)
+        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+        if android_webpage is not None:
+            player_params_str = self._search_regex(
+                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+                android_webpage, 'player parameters', default='')
+            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+            if player_params:
+                formats.extend([{
+                    'url': param,
+                    'preference': -10,
+                } for param in player_params if determine_ext(param) == 'mp4'])
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
index b777159c5639304edf1433857f626c29299e4bcb..f9afbdbab611e233c7f7014ae7d66e996f2b7c31 100644 (file)
@@ -15,6 +15,7 @@ from ..utils import (
     unescapeHTML,
     ExtractorError,
     int_or_none,
+    mimetype2ext,
 )
 
 from .nbc import NBCSportsVPlayerIE
@@ -22,7 +23,7 @@ from .nbc import NBCSportsVPlayerIE
 
 class YahooIE(InfoExtractor):
     IE_DESC = 'Yahoo screen and movies'
-    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
     _TESTS = [
         {
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -140,12 +141,15 @@ class YahooIE(InfoExtractor):
                 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
                 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
             }
+        }, {
+            'url': 'https://tw.news.yahoo.com/-100120367.html',
+            'only_matching': True,
         }
     ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
+        display_id = mobj.group('display_id') or self._match_id(url)
         page_id = mobj.group('id')
         url = mobj.group('url')
         host = mobj.group('host')
@@ -233,6 +237,22 @@ class YahooIE(InfoExtractor):
 
         self._sort_formats(formats)
 
+        closed_captions = self._html_search_regex(
+            r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+            default='[]')
+
+        cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+        subtitles = {}
+        if cc_json:
+            for closed_caption in cc_json:
+                lang = closed_caption['lang']
+                if lang not in subtitles:
+                    subtitles[lang] = []
+                subtitles[lang].append({
+                    'url': closed_caption['url'],
+                    'ext': mimetype2ext(closed_caption['content_type']),
+                })
+
         return {
             'id': video_id,
             'display_id': display_id,
@@ -241,6 +261,7 @@ class YahooIE(InfoExtractor):
             'description': clean_html(meta['description']),
             'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
             'duration': int_or_none(meta.get('duration')),
+            'subtitles': subtitles,
         }
 
 
index 19f8762ae57e814a11f868be1a008e21bbaf8efc..001ee17b6f93d457bdc2fbdaf802b61ef19e1b41 100644 (file)
@@ -9,10 +9,12 @@ from ..utils import (
     float_or_none,
     month_by_abbreviation,
     ExtractorError,
+    get_element_by_attribute,
 )
 
 
 class YamIE(InfoExtractor):
+    IE_DESC = '蕃薯藤yam天空部落'
     _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
 
     _TESTS = [{
@@ -23,6 +25,7 @@ class YamIE(InfoExtractor):
             'id': '2283921',
             'ext': 'mp3',
             'title': '發現 - 趙薇 京華煙雲主題曲',
+            'description': '發現 - 趙薇 京華煙雲主題曲',
             'uploader_id': 'princekt',
             'upload_date': '20080807',
             'duration': 313.0,
@@ -55,6 +58,17 @@ class YamIE(InfoExtractor):
             'ext': 'mp4',
         },
         'skip': 'invalid YouTube URL',
+    }, {
+        'url': 'http://mymedia.yam.com/m/2373534',
+        'md5': '7ff74b91b7a817269d83796f8c5890b1',
+        'info_dict': {
+            'id': '2373534',
+            'ext': 'mp3',
+            'title': '林俊傑&蔡卓妍-小酒窩',
+            'description': 'md5:904003395a0fcce6cfb25028ff468420',
+            'upload_date': '20080928',
+            'uploader_id': 'onliner2',
+        }
     }]
 
     def _real_extract(self, url):
@@ -75,15 +89,19 @@ class YamIE(InfoExtractor):
         if youtube_url:
             return self.url_result(youtube_url, 'Youtube')
 
+        title = self._html_search_regex(
+            r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
+
         api_page = self._download_webpage(
             'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
             note='Downloading API page')
         api_result_obj = compat_urlparse.parse_qs(api_page)
 
+        info_table = get_element_by_attribute('class', 'info', page)
         uploader_id = self._html_search_regex(
-            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z]+)"',
-            page, 'uploader id', fatal=False)
-        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})  ' +
+            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
+            info_table, 'uploader id', fatal=False)
+        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
                          r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
         if mobj:
             upload_date = '%s%02d%02d' % (
@@ -97,7 +115,8 @@ class YamIE(InfoExtractor):
         return {
             'id': video_id,
             'url': api_result_obj['mp3file'][0],
-            'title': self._html_search_meta('description', page),
+            'title': title,
+            'description': self._html_search_meta('description', page),
             'duration': duration,
             'uploader_id': uploader_id,
             'upload_date': upload_date,
diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py
new file mode 100644 (file)
index 0000000..834d860
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class YinYueTaiIE(InfoExtractor):
+    IE_NAME = 'yinyuetai:video'
+    IE_DESC = '音悦Tai'
+    _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://v.yinyuetai.com/video/2322376',
+        'md5': '6e3abe28d38e3a54b591f9f040595ce0',
+        'info_dict': {
+            'id': '2322376',
+            'ext': 'mp4',
+            'title': '少女时代_PARTY_Music Video Teaser',
+            'creator': '少女时代',
+            'duration': 25,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://v.yinyuetai.com/video/h5/2322376',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
+            'Downloading mv info')['videoInfo']['coreVideoInfo']
+
+        if info['error']:
+            raise ExtractorError(info['errorMsg'], expected=True)
+
+        formats = [{
+            'url': format_info['videoUrl'],
+            'format_id': format_info['qualityLevel'],
+            'format': format_info.get('qualityLevelName'),
+            'filesize': format_info.get('fileSize'),
+            # though URLs ends with .flv, the downloaded files are in fact mp4
+            'ext': 'mp4',
+            'tbr': format_info.get('bitrate'),
+        } for format_info in info['videoUrlModels']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['videoName'],
+            'thumbnail': info.get('bigHeadImage'),
+            'creator': info.get('artistNames'),
+            'duration': info.get('duration'),
+            'formats': formats,
+        }
index 894678a23dac9d1b03e07f0cd9b2eecc7e690e18..869f3e8190ca0b751366a85f142a0b49fe294fa1 100644 (file)
@@ -5,7 +5,7 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
 
 
 class YnetIE(InfoExtractor):
@@ -34,7 +34,7 @@ class YnetIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
+        content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
         config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
         f4m_url = config['clip']['url']
         title = self._og_search_title(webpage)
index 97b98bbe88715f644da6bec1709d697af2c8e0e0..78caeb8b36e0be8cf4e97365d9e28251723059b7 100644 (file)
 # coding: utf-8
-
 from __future__ import unicode_literals
 
-import math
-import random
-import re
-import time
+import base64
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
+from ..utils import ExtractorError
+
+from ..compat import (
+    compat_urllib_parse,
+    compat_ord,
+    compat_urllib_request,
 )
 
 
 class YoukuIE(InfoExtractor):
+    IE_NAME = 'youku'
+    IE_DESC = '优酷'
     _VALID_URL = r'''(?x)
         (?:
             http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
     '''
-    _TEST = {
-        'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
-        'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
-        'params': {
-            'test': False
+
+    _TESTS = [{
+        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+        'md5': '5f3af4192eabacc4501508d54a8cabd7',
+        'info_dict': {
+            'id': 'XMTc1ODE5Njcy_part1',
+            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+            'ext': 'flv'
+        }
+    }, {
+        'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+        'only_matching': True,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+        'info_dict': {
+            'id': 'XODgxNjg1Mzk2',
+            'title': '武媚娘传奇 85',
         },
+        'playlist_count': 11,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
         'info_dict': {
-            'id': 'XNDgyMDQ2NTQw_part00',
-            'ext': 'flv',
-            'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+            'id': 'XMTI1OTczNDM5Mg',
+            'title': '花千骨 04',
+        },
+        'playlist_count': 13,
+        'skip': 'Available in China only',
+    }]
+
+    def construct_video_urls(self, data1, data2):
+        # get sid, token
+        def yk_t(s1, s2):
+            ls = list(range(256))
+            t = 0
+            for i in range(256):
+                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+                ls[i], ls[t] = ls[t], ls[i]
+            s = bytearray()
+            x, y = 0, 0
+            for i in range(len(s2)):
+                y = (y + 1) % 256
+                x = (x + ls[y]) % 256
+                ls[x], ls[y] = ls[y], ls[x]
+                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+            return bytes(s)
+
+        sid, token = yk_t(
+            b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+        ).decode('ascii').split('_')
+
+        # get oip
+        oip = data2['ip']
+
+        # get fileid
+        string_ls = list(
+            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+        shuffled_string_ls = []
+        seed = data1['seed']
+        N = len(string_ls)
+        for ii in range(N):
+            seed = (seed * 0xd3 + 0x754f) % 0x10000
+            idx = seed * len(string_ls) // 0x10000
+            shuffled_string_ls.append(string_ls[idx])
+            del string_ls[idx]
+
+        fileid_dict = {}
+        for format in data1['streamtypes']:
+            streamfileid = [
+                int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+            fileid = ''.join(
+                [shuffled_string_ls[i] for i in streamfileid])
+            fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+        def get_fileid(format, n):
+            fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+            return fileid
+
+        # get ep
+        def generate_ep(format, n):
+            fileid = get_fileid(format, n)
+            ep_t = yk_t(
+                b'bf7e5f01',
+                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+            )
+            ep = base64.b64encode(ep_t).decode('ascii')
+            return ep
+
+        # generate video_urls
+        video_urls_dict = {}
+        for format in data1['streamtypes']:
+            video_urls = []
+            for dt in data1['segs'][format]:
+                n = str(int(dt['no']))
+                param = {
+                    'K': dt['k'],
+                    'hd': self.get_hd(format),
+                    'myp': 0,
+                    'ts': dt['seconds'],
+                    'ypp': 0,
+                    'ctype': 12,
+                    'ev': 1,
+                    'token': token,
+                    'oip': oip,
+                    'ep': generate_ep(format, n)
+                }
+                video_url = \
+                    'http://k.youku.com/player/getFlvPath/' + \
+                    'sid/' + sid + \
+                    '_' + str(int(n) + 1).zfill(2) + \
+                    '/st/' + self.parse_ext_l(format) + \
+                    '/fileid/' + get_fileid(format, n) + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                video_urls.append(video_url)
+            video_urls_dict[format] = video_urls
+
+        return video_urls_dict
+
+    def get_hd(self, fm):
+        hd_id_dict = {
+            'flv': '0',
+            'mp4': '1',
+            'hd2': '2',
+            'hd3': '3',
+            '3gp': '0',
+            '3gphd': '1'
         }
-    }
-
-    def _gen_sid(self):
-        nowTime = int(time.time() * 1000)
-        random1 = random.randint(1000, 1998)
-        random2 = random.randint(1000, 9999)
-
-        return "%d%d%d" % (nowTime, random1, random2)
-
-    def _get_file_ID_mix_string(self, seed):
-        mixed = []
-        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
-        seed = float(seed)
-        for i in range(len(source)):
-            seed = (seed * 211 + 30031) % 65536
-            index = math.floor(seed / 65536 * len(source))
-            mixed.append(source[int(index)])
-            source.remove(source[int(index)])
-        # return ''.join(mixed)
-        return mixed
-
-    def _get_file_id(self, fileId, seed):
-        mixed = self._get_file_ID_mix_string(seed)
-        ids = fileId.split('*')
-        realId = []
-        for ch in ids:
-            if ch:
-                realId.append(mixed[int(ch)])
-        return ''.join(realId)
+        return hd_id_dict[fm]
+
+    def parse_ext_l(self, fm):
+        ext_dict = {
+            'flv': 'flv',
+            'mp4': 'mp4',
+            'hd2': 'flv',
+            'hd3': 'flv',
+            '3gp': 'flv',
+            '3gphd': 'mp4'
+        }
+        return ext_dict[fm]
+
+    def get_format_name(self, fm):
+        _dict = {
+            '3gp': 'h6',
+            '3gphd': 'h5',
+            'flv': 'h4',
+            'mp4': 'h3',
+            'hd2': 'h2',
+            'hd3': 'h1'
+        }
+        return _dict[fm]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+        video_id = self._match_id(url)
 
-        config = self._download_json(info_url, video_id)
+        def retrieve_data(req_url, note):
+            req = compat_urllib_request.Request(req_url)
 
-        error_code = config['data'][0].get('error_code')
-        if error_code:
-            # -8 means blocked outside China.
-            error = config['data'][0].get('error')  # Chinese and English, separated by newline.
-            raise ExtractorError(error or 'Server reported error %i' % error_code,
-                                 expected=True)
+            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+            if cn_verification_proxy:
+                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
 
-        video_title = config['data'][0]['title']
-        seed = config['data'][0]['seed']
+            raw_data = self._download_json(req, video_id, note=note)
+            return raw_data['data'][0]
 
-        format = self._downloader.params.get('format', None)
-        supported_format = list(config['data'][0]['streamfileids'].keys())
+        # request basic data
+        data1 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
+            'Downloading JSON metadata 1')
+        data2 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+            'Downloading JSON metadata 2')
 
-        # TODO proper format selection
-        if format is None or format == 'best':
-            if 'hd2' in supported_format:
-                format = 'hd2'
+        error_code = data1.get('error_code')
+        if error_code:
+            error = data1.get('error')
+            if error is not None and '因版权原因无法观看此视频' in error:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is available in China only', expected=True)
             else:
-                format = 'flv'
-            ext = 'flv'
-        elif format == 'worst':
-            format = 'mp4'
-            ext = 'mp4'
-        else:
-            format = 'flv'
-            ext = 'flv'
-
-        fileid = config['data'][0]['streamfileids'][format]
-        keys = [s['k'] for s in config['data'][0]['segs'][format]]
-        # segs is usually a dictionary, but an empty *list* if an error occured.
-
-        files_info = []
-        sid = self._gen_sid()
-        fileid = self._get_file_id(fileid, seed)
-
-        # column 8,9 of fileid represent the segment number
-        # fileid[7:9] should be changed
-        for index, key in enumerate(keys):
-            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
-            download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
-            info = {
-                'id': '%s_part%02d' % (video_id, index),
-                'url': download_url,
-                'uploader': None,
-                'upload_date': None,
-                'title': video_title,
-                'ext': ext,
-            }
-            files_info.append(info)
-
-        return files_info
+                msg = 'Youku server reported error %i' % error_code
+                if error is not None:
+                    msg += ': ' + error
+                raise ExtractorError(msg)
+
+        title = data1['title']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(data1, data2)
+
+        # construct info
+        entries = [{
+            'id': '%s_part%d' % (video_id, i + 1),
+            'title': title,
+            'formats': [],
+            # some formats are not available for all parts, we have to detect
+            # which one has all
+        } for i in range(max(len(v) for v in data1['segs'].values()))]
+        for fm in data1['streamtypes']:
+            video_urls = video_urls_dict[fm]
+            for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+                entry['formats'].append({
+                    'url': video_url,
+                    'format_id': self.get_format_name(fm),
+                    'ext': self.parse_ext_l(fm),
+                    'filesize': int(seg['size']),
+                })
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'title': title,
+            'entries': entries,
+        }
index 6abe72f739b63d8b39d8cdfc5bfccf70dc545715..4ba7c36db78fb457b63e05fe161a75b00383c78c 100644 (file)
@@ -47,7 +47,7 @@ class YouPornIE(InfoExtractor):
 
         # Get JSON parameters
         json_params = self._search_regex(
-            [r'var\s+videoJa?son\s*=\s*({.+?});',
+            [r'videoJa?son\s*=\s*({.+})',
              r'var\s+currentVideo\s*=\s*new\s+Video\((.+?)\)[,;]'],
             webpage, 'JSON parameters')
         try:
index 40fc4165f402722eca38cbfbc7c06cf2e4409a64..4e25d6f22312a0dca9f1997baa3bacd1c3fd263d 100644 (file)
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -16,7 +14,7 @@ class YourUploadIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://yourupload.com/watch/14i14h',
-            'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+            'md5': '5e2c63385454c557f97c4c4131a393cd',
             'info_dict': {
                 'id': '14i14h',
                 'ext': 'mp4',
@@ -35,24 +33,21 @@ class YourUploadIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
-        url = 'http://embed.yucache.net/{0:}'.format(video_id)
-        webpage = self._download_webpage(url, video_id)
+        embed_url = 'http://embed.yucache.net/{0:}'.format(video_id)
+        webpage = self._download_webpage(embed_url, video_id)
 
         title = self._og_search_title(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-        url = self._og_search_video_url(webpage)
-
-        formats = [{
-            'format_id': 'sd',
-            'url': url,
-        }]
+        video_url = self._og_search_video_url(webpage)
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
 
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
+            'url': video_url,
             'thumbnail': thumbnail,
+            'http_headers': {
+                'Referer': embed_url,
+            },
         }
index 52909b0daa945170e2d3ebec6892b9f906f3087e..3d8b31f9830b218e02021b41214010b4f61d29a9 100644 (file)
@@ -17,6 +17,8 @@ from ..compat import (
     compat_chr,
     compat_parse_qs,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_request,
     compat_urlparse,
     compat_str,
@@ -28,11 +30,12 @@ from ..utils import (
     get_element_by_attribute,
     get_element_by_id,
     int_or_none,
-    OnDemandPagedList,
     orderedSet,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
     uppercase_escape,
+    ISO3166Utils,
 )
 
 
@@ -50,6 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             # YouTube sets the expire time to about two months
             expire_time=time.time() + 2 * 30 * 24 * 3600)
 
+    def _ids_to_results(self, ids):  # one 'Youtube' url_result per video id, in input order
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
     def _login(self):
         """
         Attempt to log in to YouTube.
@@ -230,6 +238,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '44': {'ext': 'webm', 'width': 854, 'height': 480},
         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},
 
 
         # 3d videos
@@ -512,6 +522,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': 'requires avconv',
             }
         },
+        # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+        {
+            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+            'info_dict': {
+                'id': 'FIl7x6_3R5Y',
+                'ext': 'mp4',
+                'title': 'md5:7b81415841e02ecd4313668cde88737a',
+                'description': 'md5:116377fd2963b81ec4ce64b542173306',
+                'upload_date': '20150625',
+                'uploader_id': 'dorappi2000',
+                'uploader': 'dorappi2000',
+                'formats': 'mincount:33',
+            },
+        },
+        # DASH manifest with segment_list
+        {
+            'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+            'md5': '8ce563a1d667b599d21064e982ab9e31',
+            'info_dict': {
+                'id': 'CsmdDsKjzN8',
+                'ext': 'mp4',
+                'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+                'uploader': 'Airtek',
+                'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+                'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+                'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '135',  # bestvideo
+            }
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -776,16 +818,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 
     def _parse_dash_manifest(
-            self, video_id, dash_manifest_url, player_url, age_gate):
+            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
         def decrypt_sig(mobj):
             s = mobj.group(1)
             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
             return '/signature/%s' % dec_s
-        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
         dash_doc = self._download_xml(
             dash_manifest_url, video_id,
             note='Downloading DASH manifest',
-            errnote='Could not download DASH manifest')
+            errnote='Could not download DASH manifest',
+            fatal=fatal)
+
+        if dash_doc is False:
+            return []
 
         formats = []
         for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
@@ -798,6 +844,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     # TODO implement WebVTT downloading
                     pass
                 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                     format_id = r.attrib['id']
                     video_url = url_el.text
                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
@@ -811,6 +858,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         'filesize': filesize,
                         'fps': int_or_none(r.attrib.get('frameRate')),
                     }
+                    if segment_list is not None:
+                        f.update({
+                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
+                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
+                            'protocol': 'http_dash_segments',
+                        })
                     try:
                         existing_format = next(
                             fo for fo in formats
@@ -818,6 +871,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     except StopIteration:
                         full_info = self._formats.get(format_id, {}).copy()
                         full_info.update(f)
+                        codecs = r.attrib.get('codecs')
+                        if codecs:
+                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+                                full_info['vcodec'] = codecs
+                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+                                full_info['acodec'] = codecs
                         formats.append(full_info)
                     else:
                         existing_format.update(f)
@@ -833,7 +892,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
         mobj = re.search(self._NEXT_URL_RE, url)
         if mobj:
-            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+            url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
         video_id = self.extract_id(url)
 
         # Get video webpage
@@ -847,8 +906,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             player_url = None
 
+        dash_mpds = []
+
+        def add_dash_mpd(video_info):  # remember each distinct 'dashmpd' manifest URL once, in discovery order
+            dash_mpd = video_info.get('dashmpd')
+            if dash_mpd and dash_mpd[0] not in dash_mpds:  # values are lists (compat_parse_qs shape), hence [0]
+                dash_mpds.append(dash_mpd[0])
+
         # Get video info
         embed_webpage = None
+        is_live = None
         if re.search(r'player-age-gate-content">', video_webpage) is not None:
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -867,24 +934,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 note='Refetching age-gated info webpage',
                 errnote='unable to download video info webpage')
             video_info = compat_parse_qs(video_info_webpage)
+            add_dash_mpd(video_info)
         else:
             age_gate = False
-            try:
-                # Try looking directly into the video webpage
-                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
-                if not mobj:
-                    raise ValueError('Could not find ytplayer.config')  # caught below
+            video_info = None
+            # Try looking directly into the video webpage
+            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+            if mobj:
                 json_code = uppercase_escape(mobj.group(1))
                 ytplayer_config = json.loads(json_code)
                 args = ytplayer_config['args']
-                # Convert to the same format returned by compat_parse_qs
-                video_info = dict((k, [v]) for k, v in args.items())
-                if not args.get('url_encoded_fmt_stream_map'):
-                    raise ValueError('No stream_map present')  # caught below
-            except ValueError:
-                # We fallback to the get_video_info pages (used by the embed page)
+                if args.get('url_encoded_fmt_stream_map'):
+                    # Convert to the same format returned by compat_parse_qs
+                    video_info = dict((k, [v]) for k, v in args.items())
+                    add_dash_mpd(video_info)
+                if args.get('livestream') == '1' or args.get('live_playback') == 1:
+                    is_live = True
+            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+                # We also try looking in get_video_info since it may contain different dashmpd
+                # URL that points to a DASH manifest with possibly different itag set (some itags
+                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+                # manifest pointed by get_video_info's dashmpd).
+                # The general idea is to take a union of itags of both DASH manifests (for example
+                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                 self.report_video_info_webpage_download(video_id)
-                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                     video_info_url = (
                         '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                         % (proto, video_id, el_type))
@@ -892,11 +966,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         video_info_url,
                         video_id, note=False,
                         errnote='unable to download video info webpage')
-                    video_info = compat_parse_qs(video_info_webpage)
-                    if 'token' in video_info:
+                    get_video_info = compat_parse_qs(video_info_webpage)
+                    add_dash_mpd(get_video_info)
+                    if not video_info:
+                        video_info = get_video_info
+                    if 'token' in get_video_info:
                         break
         if 'token' not in video_info:
             if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+                    if regions_allowed is not None:
+                        raise ExtractorError('YouTube said: This video is available in %s only' % (
+                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+                            expected=True)
                 raise ExtractorError(
                     'YouTube said: %s' % video_info['reason'][0],
                     expected=True, video_id=video_id)
@@ -920,7 +1003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # uploader
         if 'author' not in video_info:
             raise ExtractorError('Unable to extract uploader name')
-        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+        video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
 
         # uploader_id
         video_uploader_id = None
@@ -947,18 +1030,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader.report_warning('unable to extract video thumbnail')
             video_thumbnail = None
         else:   # don't panic if we can't find it
-            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
 
         # upload date
-        upload_date = None
-        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
-        if mobj is None:
-            mobj = re.search(
-                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
-                video_webpage)
-        if mobj is not None:
-            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
-            upload_date = unified_strdate(upload_date)
+        upload_date = self._html_search_meta(
+            'datePublished', video_webpage, 'upload date', default=None)
+        if not upload_date:
+            upload_date = self._search_regex(
+                [r'(?s)id="eow-date.*?>(.*?)</span>',
+                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+                video_webpage, 'upload date', default=None)
+            if upload_date:  # turn '/', ',' and '-' separators into single spaces before date parsing
+                upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
+        upload_date = unified_strdate(upload_date)
 
         m_cat_container = self._search_regex(
             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -992,12 +1076,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 video_description = ''
 
         def _extract_count(count_name):
-            count = self._search_regex(
-                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
-                video_webpage, count_name, default=None)
-            if count is not None:
-                return int(count.replace(',', ''))
-            return None
+            return str_to_int(self._search_regex(
+                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+                % re.escape(count_name),
+                video_webpage, count_name, default=None))
+
         like_count = _extract_count('like')
         dislike_count = _extract_count('dislike')
 
@@ -1009,7 +1092,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader.report_warning('unable to extract video duration')
             video_duration = None
         else:
-            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
+            video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
 
         # annotations
         video_annotations = None
@@ -1112,23 +1195,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         # Look for the DASH manifest
         if self._downloader.params.get('youtube_include_dash_manifest', True):
-            dash_mpd = video_info.get('dashmpd')
-            if dash_mpd:
-                dash_manifest_url = dash_mpd[0]
+            dash_mpd_fatal = True
+            dash_formats = {}  # shared by all manifests so the first manifest providing a format_id wins
+            for dash_manifest_url in dash_mpds:
                 try:
-                    dash_formats = self._parse_dash_manifest(
-                        video_id, dash_manifest_url, player_url, age_gate)
+                    for df in self._parse_dash_manifest(
+                            video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
+                        # Do not overwrite DASH format found in some previous DASH manifest
+                        if df['format_id'] not in dash_formats:
+                            dash_formats[df['format_id']] = df
+                        # Additional DASH manifests may end up in HTTP Error 403 therefore
+                        # allow them to fail without bug report message if we already have
+                        # some DASH manifest succeeded. This is temporary workaround to reduce
+                        # burst of bug reports until we figure out the reason and whether it
+                        # can be fixed at all.
+                        dash_mpd_fatal = False
                 except (ExtractorError, KeyError) as e:
                     self.report_warning(
                         'Skipping DASH manifest: %r' % e, video_id)
-                else:
-                    # Hide the formats we found through non-DASH
-                    dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
-                    formats.extend(dash_formats)
+                if dash_formats:
+                    # Remove the formats we found through non-DASH, they
+                    # contain less info and it can be wrong, because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
+                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+                    formats.extend(dash_formats.values())
 
         # Check for malformed aspect ratio
         stretched_m = re.search(
@@ -1162,6 +1254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'dislike_count': dislike_count,
             'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
             'formats': formats,
+            'is_live': is_live,
         }
 
 
@@ -1262,11 +1355,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _ids_to_results(self, ids):
-        return [
-            self.url_result(vid_id, 'Youtube', video_id=vid_id)
-            for vid_id in ids]
-
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
         # the id of the playlist is just 'RD' + video_id
@@ -1290,46 +1378,55 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
 
-        # Check if the playlist exists or is private
-        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
-            raise ExtractorError(
-                'The playlist doesn\'t exist or is private, use --username or '
-                '--netrc to access it.',
-                expected=True)
+        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
+            match = match.strip()
+            # Check if the playlist exists or is private
+            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
+                raise ExtractorError(
+                    'The playlist doesn\'t exist or is private, use --username or '
+                    '--netrc to access it.',
+                    expected=True)
+            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
+                raise ExtractorError(
+                    'Invalid parameters. Maybe URL is incorrect.',
+                    expected=True)
+            elif re.match(r'[^<]*Choose your language[^<]*', match):
+                continue
+            else:
+                self.report_warning('Youtube gives an alert message: ' + match)
 
         # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
 
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
@@ -1358,6 +1455,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
 class YoutubeChannelIE(InfoExtractor):
     IE_DESC = 'YouTube.com channels'
     _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
     IE_NAME = 'youtube:channel'
     _TESTS = [{
         'note': 'paginated channel',
@@ -1368,7 +1466,8 @@ class YoutubeChannelIE(InfoExtractor):
         }
     }]
 
-    def extract_videos_from_page(self, page):
+    @staticmethod
+    def extract_videos_from_page(page):
         ids_in_page = []
         titles_in_page = []
         for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
@@ -1386,8 +1485,26 @@ class YoutubeChannelIE(InfoExtractor):
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
-        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
-        channel_page = self._download_webpage(url, channel_id)
+        url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+        # otherwise fallback on channel by page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False) or ''  # failed non-fatal download is falsy; '' keeps the searches below safe
+        channel_playlist_id = self._html_search_meta(
+            'channelId', channel_page, 'channel id', default=None)
+        if not channel_playlist_id:
+            channel_playlist_id = self._search_regex(
+                r'data-channel-external-id="([^"]+)"',
+                channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
+        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
                     channel-header-autogenerated-label|
@@ -1429,12 +1546,10 @@ class YoutubeChannelIE(InfoExtractor):
         return self.playlist_result(_entries(), channel_id)
 
 
-class YoutubeUserIE(InfoExtractor):
+class YoutubeUserIE(YoutubeChannelIE):
     IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
-    _GDATA_PAGE_SIZE = 50
-    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
+    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
     IE_NAME = 'youtube:user'
 
     _TESTS = [{
@@ -1458,95 +1573,57 @@ class YoutubeUserIE(InfoExtractor):
         else:
             return super(YoutubeUserIE, cls).suitable(url)
 
-    def _real_extract(self, url):
-        username = self._match_id(url)
-
-        # Download video ids using YouTube Data API. Result size per
-        # query is limited (currently to 50 videos) so we need to query
-        # page by page until there are no video ids - it means we got
-        # all of them.
-
-        def download_page(pagenum):
-            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
-
-            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(
-                gdata_url, username,
-                'Downloading video ids from %d to %d' % (
-                    start_index, start_index + self._GDATA_PAGE_SIZE))
-
-            try:
-                response = json.loads(page)
-            except ValueError as err:
-                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
-            if 'entry' not in response['feed']:
-                return
-
-            # Extract video identifiers
-            entries = response['feed']['entry']
-            for entry in entries:
-                title = entry['title']['$t']
-                video_id = entry['id']['$t'].split('/')[-1]
-                yield {
-                    '_type': 'url',
-                    'url': video_id,
-                    'ie_key': 'Youtube',
-                    'id': video_id,
-                    'title': title,
-                }
-        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
-
-        return self.playlist_result(url_results, playlist_title=username)
-
 
-class YoutubeSearchIE(SearchInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
     IE_DESC = 'YouTube.com searches'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
-    _MAX_RESULTS = 1000
+    # there doesn't appear to be a real limit, for example if you search for
+    # 'python' you get more than 8.000.000 results
+    _MAX_RESULTS = float('inf')
     IE_NAME = 'youtube:search'
     _SEARCH_KEY = 'ytsearch'
+    _EXTRA_QUERY_ARGS = {}
+    _TESTS = []
 
     def _get_n_results(self, query, n):
         """Get a specified number of results for a query"""
 
-        video_ids = []
-        pagenum = 0
+        videos = []
         limit = n
-        PAGE_SIZE = 50
 
-        while (PAGE_SIZE * pagenum) < limit:
-            result_url = self._API_URL % (
-                compat_urllib_parse.quote_plus(query.encode('utf-8')),
-                (PAGE_SIZE * pagenum) + 1)
-            data_json = self._download_webpage(
+        for pagenum in itertools.count(1):
+            url_query = {
+                'search_query': query.encode('utf-8'),
+                'page': pagenum,
+                'spf': 'navigate',
+            }
+            url_query.update(self._EXTRA_QUERY_ARGS)
+            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
+            data = self._download_json(
                 result_url, video_id='query "%s"' % query,
-                note='Downloading page %s' % (pagenum + 1),
+                note='Downloading page %s' % pagenum,
                 errnote='Unable to download API page')
-            data = json.loads(data_json)
-            api_response = data['data']
+            html_content = data[1]['body']['content']
 
-            if 'items' not in api_response:
+            if 'class="search-message' in html_content:
                 raise ExtractorError(
                     '[youtube] No video results', expected=True)
 
-            new_ids = list(video['id'] for video in api_response['items'])
-            video_ids += new_ids
-
-            limit = min(n, api_response['totalItems'])
-            pagenum += 1
+            new_videos = self._ids_to_results(orderedSet(re.findall(
+                r'href="/watch\?v=(.{11})', html_content)))
+            videos += new_videos
+            if not new_videos or len(videos) >= limit:  # stop as soon as n results are collected
+                break
 
-        if len(video_ids) > n:
-            video_ids = video_ids[:n]
-        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
-                  for video_id in video_ids]
+        if len(videos) > n:
+            videos = videos[:n]
         return self.playlist_result(videos, query)
 
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = 'YouTube.com searches, newest videos first'
+    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
 
 
 class YoutubeSearchURLIE(InfoExtractor):
@@ -1563,7 +1640,7 @@ class YoutubeSearchURLIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
 
         webpage = self._download_webpage(url, query)
         result_code = self._search_regex(
@@ -1630,20 +1707,10 @@ class YoutubeShowIE(InfoExtractor):
 
 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
     """
-    Base class for extractors that fetch info from
-    http://www.youtube.com/feed_ajax
+    Base class for feed extractors
     Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
     """
     _LOGIN_REQUIRED = True
-    # use action_load_personal_feed instead of action_load_system_feed
-    _PERSONAL_FEED = False
-
-    @property
-    def _FEED_TEMPLATE(self):
-        action = 'action_load_system_feed'
-        if self._PERSONAL_FEED:
-            action = 'action_load_personal_feed'
-        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
 
     @property
     def IE_NAME(self):
@@ -1653,36 +1720,38 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
         self._login()
 
     def _real_extract(self, url):
-        feed_entries = []
-        paging = 0
-        for i in itertools.count(1):
-            info = self._download_json(
-                self._FEED_TEMPLATE % paging,
-                '%s feed' % self._FEED_NAME,
-                'Downloading page %s' % i,
-                transform_source=uppercase_escape)
-            feed_html = info.get('feed_html') or info.get('content_html')
-            load_more_widget_html = info.get('load_more_widget_html') or feed_html
-            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
-            ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in ids)
-            mobj = re.search(
-                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                load_more_widget_html)
-            if mobj is None:
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
+
+        # The extraction process is the same as for playlists, but the regex
+        # for the video ids doesn't contain an index
+        ids = []
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
                 break
-            paging = mobj.group('paging')
-        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
 
+            ids.extend(new_ids)
 
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:recommended'
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
-    _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+
+        return self.playlist_result(
+            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
 
 
 class YoutubeWatchLaterIE(YoutubePlaylistIE):
@@ -1696,15 +1765,6 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE):
         return self._extract_playlist('WL')
 
 
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:history'
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
-    _FEED_NAME = 'history'
-    _PERSONAL_FEED = True
-    _PLAYLIST_TITLE = 'Youtube Watch History'
-
-
 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
     IE_NAME = 'youtube:favorites'
     IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
@@ -1717,42 +1777,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
         return self.url_result(playlist_id, 'YoutubePlaylist')
 
 
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:subscriptions'
-    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube Subscriptions'
-        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
-            ids.extend(new_ids)
 
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
 
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
 
 
 class YoutubeTruncatedURLIE(InfoExtractor):
index 1afbe68ed68e084028cda0c0f9d7d80a385118fb..7dc1e2f2bd3f36e2b71e199fb5e5f6f2cc4e18e9 100644 (file)
@@ -4,12 +4,18 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import ExtractorError
 
 
 class ZingMp3BaseInfoExtractor(InfoExtractor):
 
-    @staticmethod
-    def _extract_item(item):
+    def _extract_item(self, item):
+        error_message = item.find('./errormessage').text
+        if error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
         title = item.find('./title').text.strip()
         source = item.find('./source').text
         extension = item.attrib['type']
index 11603f60d6c9a1f5aff35a15927a7d1db2521802..9016e34983d3fed5e0fab72e9a8626124cdee859 100644 (file)
@@ -145,11 +145,15 @@ def parseOpts(overrideArguments=None):
     general.add_option(
         '--list-extractors',
         action='store_true', dest='list_extractors', default=False,
-        help='List all supported extractors and the URLs they would handle')
+        help='List all supported extractors')
     general.add_option(
         '--extractor-descriptions',
         action='store_true', dest='list_extractor_descriptions', default=False,
         help='Output descriptions of all supported extractors')
+    general.add_option(
+        '--force-generic-extractor',
+        action='store_true', dest='force_generic_extractor', default=False,
+        help='Force extraction to use the generic extractor')
     general.add_option(
         '--default-search',
         dest='default_search', metavar='PREFIX',
@@ -215,7 +219,7 @@ def parseOpts(overrideArguments=None):
     selection.add_option(
         '--playlist-items',
         dest='playlist_items', metavar='ITEM_SPEC', default=None,
-        help='Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
+        help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
     selection.add_option(
         '--match-title',
         dest='matchtitle', metavar='REGEX',
@@ -322,32 +326,7 @@ def parseOpts(overrideArguments=None):
     video_format.add_option(
         '-f', '--format',
         action='store', dest='format', metavar='FORMAT', default=None,
-        help=(
-            'Video format code, specify the order of preference using'
-            ' slashes, as in -f 22/17/18 . '
-            ' Instead of format codes, you can select by extension for the '
-            'extensions aac, m4a, mp3, mp4, ogg, wav, webm. '
-            'You can also use the special names "best",'
-            ' "bestvideo", "bestaudio", "worst". '
-            ' You can filter the video results by putting a condition in'
-            ' brackets, as in -f "best[height=720]"'
-            ' (or -f "[filesize>10M]"). '
-            ' This works for filesize, height, width, tbr, abr, vbr, asr, and fps'
-            ' and the comparisons <, <=, >, >=, =, !='
-            ' and for ext, acodec, vcodec, container, and protocol'
-            ' and the comparisons =, != .'
-            ' Formats for which the value is not known are excluded unless you'
-            ' put a question mark (?) after the operator.'
-            ' You can combine format filters, so  '
-            '-f "[height <=? 720][tbr>500]" '
-            'selects up to 720p videos (or videos where the height is not '
-            'known) with a bitrate of at least 500 KBit/s.'
-            ' By default, youtube-dl will pick the best quality.'
-            ' Use commas to download multiple audio formats, such as'
-            ' -f  136/137/mp4/bestvideo,140/m4a/bestaudio.'
-            ' You can merge the video and audio of two formats into a single'
-            ' file using -f <video-format>+<audio-format> (requires ffmpeg or'
-            ' avconv), for example -f bestvideo+bestaudio.'))
+        help='Video format code, see the "FORMAT SELECTION" for all the info')
     video_format.add_option(
         '--all-formats',
         action='store_const', dest='format', const='all',
@@ -356,10 +335,6 @@ def parseOpts(overrideArguments=None):
         '--prefer-free-formats',
         action='store_true', dest='prefer_free_formats', default=False,
         help='Prefer free video formats unless a specific one is requested')
-    video_format.add_option(
-        '--max-quality',
-        action='store', dest='format_limit', metavar='FORMAT',
-        help='Highest quality format to download')
     video_format.add_option(
         '-F', '--list-formats',
         action='store_true', dest='listformats',
@@ -371,12 +346,13 @@ def parseOpts(overrideArguments=None):
     video_format.add_option(
         '--youtube-skip-dash-manifest',
         action='store_false', dest='youtube_include_dash_manifest',
-        help='Do not download the DASH manifest on YouTube videos')
+        help='Do not download the DASH manifests and related data on YouTube videos')
     video_format.add_option(
         '--merge-output-format',
         action='store', dest='merge_output_format', metavar='FORMAT', default=None,
         help=(
-            'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
+            'If a merge is required (e.g. bestvideo+bestaudio), '
+            'output to given container format. One of mkv, mp4, ogg, webm, flv. '
             'Ignored if no merge is required'))
 
     subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
@@ -566,7 +542,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '--dump-pages', '--dump-intermediate-pages',
         action='store_true', dest='dump_intermediate_pages', default=False,
-        help='Print downloaded pages to debug problems (very verbose)')
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
     verbosity.add_option(
         '--write-pages',
         action='store_true', dest='write_pages', default=False,
@@ -666,7 +642,7 @@ def parseOpts(overrideArguments=None):
     filesystem.add_option(
         '--write-annotations',
         action='store_true', dest='writeannotations', default=False,
-        help='Write video annotations to a .annotation file')
+        help='Write video annotations to a .annotations.xml file')
     filesystem.add_option(
         '--load-info',
         dest='load_info_filename', metavar='FILE',
@@ -715,7 +691,11 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '--recode-video',
         metavar='FORMAT', dest='recodevideo', default=None,
-        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)')
+        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)')
+    postproc.add_option(
+        '--postprocessor-args',
+        dest='postprocessor_args', metavar='ARGS',
+        help='Give these arguments to the postprocessor')
     postproc.add_option(
         '-k', '--keep-video',
         action='store_true', dest='keepvideo', default=False,
@@ -727,7 +707,7 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '--embed-subs',
         action='store_true', dest='embedsubtitles', default=False,
-        help='Embed subtitles in the video (only for mp4 videos)')
+        help='Embed subtitles in the video (only for mkv and mp4 videos)')
     postproc.add_option(
         '--embed-thumbnail',
         action='store_true', dest='embedthumbnail', default=False,
@@ -742,7 +722,7 @@ def parseOpts(overrideArguments=None):
         help='Parse additional metadata like song title / artist from the video title. '
              'The format syntax is the same as --output, '
              'the parsed parameters replace existing values. '
-             'Additional templates: %(album), %(artist). '
+             'Additional templates: %(album)s, %(artist)s. '
              'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
              '"Coldplay - Paradise"')
     postproc.add_option(
@@ -754,7 +734,7 @@ def parseOpts(overrideArguments=None):
         metavar='POLICY', dest='fixup', default='detect_or_warn',
         help='Automatically correct known faults of the file. '
              'One of never (do nothing), warn (only emit a warning), '
-             'detect_or_warn(the default; fix file if we can, warn otherwise)')
+             'detect_or_warn (the default; fix file if we can, warn otherwise)')
     postproc.add_option(
         '--prefer-avconv',
         action='store_false', dest='prefer_ffmpeg',
index f39acadce8ac6ab0603e2355af8397edd93249ac..0d8ef6ca26c6ef7f1b7b402b387d20eebd3f8a8f 100644 (file)
@@ -1,9 +1,8 @@
 from __future__ import unicode_literals
 
-from .atomicparsley import AtomicParsleyPP
+from .embedthumbnail import EmbedThumbnailPP
 from .ffmpeg import (
     FFmpegPostProcessor,
-    FFmpegAudioFixPP,
     FFmpegEmbedSubtitlePP,
     FFmpegExtractAudioPP,
     FFmpegFixupStretchedPP,
@@ -23,9 +22,8 @@ def get_postprocessor(key):
 
 
 __all__ = [
-    'AtomicParsleyPP',
+    'EmbedThumbnailPP',
     'ExecAfterDownloadPP',
-    'FFmpegAudioFixPP',
     'FFmpegEmbedSubtitlePP',
     'FFmpegExtractAudioPP',
     'FFmpegFixupM4aPP',
diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py
deleted file mode 100644 (file)
index a5dfc13..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-
-
-import os
-import subprocess
-
-from .common import PostProcessor
-from ..compat import (
-    compat_urlretrieve,
-)
-from ..utils import (
-    check_executable,
-    encodeFilename,
-    PostProcessingError,
-    prepend_extension,
-    shell_quote
-)
-
-
-class AtomicParsleyPPError(PostProcessingError):
-    pass
-
-
-class AtomicParsleyPP(PostProcessor):
-    def run(self, info):
-        if not check_executable('AtomicParsley', ['-v']):
-            raise AtomicParsleyPPError('AtomicParsley was not found. Please install.')
-
-        filename = info['filepath']
-        temp_filename = prepend_extension(filename, 'temp')
-        temp_thumbnail = prepend_extension(filename, 'thumb')
-
-        if not info.get('thumbnail'):
-            raise AtomicParsleyPPError('Thumbnail was not found. Nothing to do.')
-
-        compat_urlretrieve(info['thumbnail'], temp_thumbnail)
-
-        cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename]
-
-        self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
-
-        if self._downloader.params.get('verbose', False):
-            self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd))
-
-        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, stderr = p.communicate()
-
-        if p.returncode != 0:
-            msg = stderr.decode('utf-8', 'replace').strip()
-            raise AtomicParsleyPPError(msg)
-
-        os.remove(encodeFilename(temp_thumbnail))
-        # for formats that don't support thumbnails (like 3gp) AtomicParsley
-        # won't create to the temporary file
-        if b'No changes' in stdout:
-            self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail')
-        else:
-            os.remove(encodeFilename(filename))
-            os.rename(encodeFilename(temp_filename), encodeFilename(filename))
-
-        return True, info
index ef9fdfa19a5562b6b85840ce129b632967b30326..4191d040bb1da468e248d57b78df1afdacf64cd0 100644 (file)
@@ -23,6 +23,9 @@ class PostProcessor(object):
 
     PostProcessor objects follow a "mutual registration" process similar
     to InfoExtractor objects.
+
+    Optionally PostProcessor can use a list of additional command-line arguments
+    with self._configuration_args.
     """
 
     _downloader = None
@@ -42,14 +45,14 @@ class PostProcessor(object):
         one has an extra field called "filepath" that points to the
         downloaded file.
 
-        This method returns a tuple, the first element of which describes
-        whether the original file should be kept (i.e. not deleted - None for
-        no preference), and the second of which is the updated information.
+        This method returns a tuple, the first element of which is a list of
+        the files that can be deleted, and the second of which is the updated
+        information.
 
         In addition, this method may raise a PostProcessingError
         exception if post processing fails.
         """
-        return None, information  # by default, keep file and do nothing
+        return [], information  # by default, keep file and do nothing
 
     def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'):
         try:
@@ -57,6 +60,13 @@ class PostProcessor(object):
         except Exception:
             self._downloader.report_warning(errnote)
 
+    def _configuration_args(self, default=[]):
+        pp_args = self._downloader.params.get('postprocessor_args')
+        if pp_args is None:
+            return default
+        assert isinstance(pp_args, list)
+        return pp_args
+
 
 class AudioConversionError(PostProcessingError):
     pass
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
new file mode 100644 (file)
index 0000000..e19dbf7
--- /dev/null
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+
+import os
+import subprocess
+
+from .ffmpeg import FFmpegPostProcessor
+
+from ..utils import (
+    check_executable,
+    encodeArgument,
+    encodeFilename,
+    PostProcessingError,
+    prepend_extension,
+    shell_quote
+)
+
+
+class EmbedThumbnailPPError(PostProcessingError):
+    pass
+
+
+class EmbedThumbnailPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, already_have_thumbnail=False):
+        super(EmbedThumbnailPP, self).__init__(downloader)
+        self._already_have_thumbnail = already_have_thumbnail
+
+    def run(self, info):
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        if not info.get('thumbnails'):
+            raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.')
+
+        thumbnail_filename = info['thumbnails'][-1]['filename']
+
+        if not os.path.exists(encodeFilename(thumbnail_filename)):
+            self._downloader.report_warning(
+                'Skipping embedding the thumbnail because the file is missing.')
+            return [], info
+
+        if info['ext'] == 'mp3':
+            options = [
+                '-c', 'copy', '-map', '0', '-map', '1',
+                '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
+
+            self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
+
+            self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
+
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
+            os.remove(encodeFilename(filename))
+            os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        elif info['ext'] in ['m4a', 'mp4']:
+            if not check_executable('AtomicParsley', ['-v']):
+                raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
+
+            cmd = [encodeFilename('AtomicParsley', True),
+                   encodeFilename(filename, True),
+                   encodeArgument('--artwork'),
+                   encodeFilename(thumbnail_filename, True),
+                   encodeArgument('-o'),
+                   encodeFilename(temp_filename, True)]
+
+            self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
+
+            if self._downloader.params.get('verbose', False):
+                self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd))
+
+            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = p.communicate()
+
+            if p.returncode != 0:
+                msg = stderr.decode('utf-8', 'replace').strip()
+                raise EmbedThumbnailPPError(msg)
+
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
+            # for formats that don't support thumbnails (like 3gp) AtomicParsley
+            # won't create the temporary file
+            if b'No changes' in stdout:
+                self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail')
+            else:
+                os.remove(encodeFilename(filename))
+                os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        else:
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
+
+        return [], info
index 75c0f7bbe86ef8e19f41fd61e1bbd58678474d8a..13794b7ba8653b179a08a348744441ae5c296852 100644 (file)
@@ -8,8 +8,8 @@ from ..utils import PostProcessingError
 
 
 class ExecAfterDownloadPP(PostProcessor):
-    def __init__(self, downloader=None, verboseOutput=None, exec_cmd=None):
-        self.verboseOutput = verboseOutput
+    def __init__(self, downloader, exec_cmd):
+        super(ExecAfterDownloadPP, self).__init__(downloader)
         self.exec_cmd = exec_cmd
 
     def run(self, information):
@@ -25,4 +25,4 @@ class ExecAfterDownloadPP(PostProcessor):
             raise PostProcessingError(
                 'Command returned error code %d' % retCode)
 
-        return None, information  # by default, keep file and do nothing
+        return [], information
index 8e99a3c2c461d300dbf077236907e0f80bd16e9b..1f723908be8d4ff0247affc5aed9ffc44e777602 100644 (file)
@@ -20,6 +20,8 @@ from ..utils import (
     prepend_extension,
     shell_quote,
     subtitles_filename,
+    dfxp2srt,
+    ISO639Utils,
 )
 
 
@@ -28,9 +30,8 @@ class FFmpegPostProcessorError(PostProcessingError):
 
 
 class FFmpegPostProcessor(PostProcessor):
-    def __init__(self, downloader=None, deletetempfiles=False):
+    def __init__(self, downloader=None):
         PostProcessor.__init__(self, downloader)
-        self._deletetempfiles = deletetempfiles
         self._determine_executables()
 
     def check_version(self):
@@ -130,6 +131,8 @@ class FFmpegPostProcessor(PostProcessor):
         oldest_mtime = min(
             os.stat(encodeFilename(path)).st_mtime for path in input_paths)
 
+        opts += self._configuration_args()
+
         files_cmd = []
         for path in input_paths:
             files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
@@ -148,10 +151,6 @@ class FFmpegPostProcessor(PostProcessor):
             raise FFmpegPostProcessorError(msg)
         self.try_utime(out_path, oldest_mtime, oldest_mtime)
 
-        if self._deletetempfiles:
-            for ipath in input_paths:
-                os.remove(ipath)
-
     def run_ffmpeg(self, path, out_path, opts):
         self.run_ffmpeg_multiple_files([path], out_path, opts)
 
@@ -264,15 +263,14 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
         new_path = prefix + sep + extension
 
         # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
-        if new_path == path:
-            self._nopostoverwrites = True
+        if (new_path == path or
+                (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
+            self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path)
+            return [], information
 
         try:
-            if self._nopostoverwrites and os.path.exists(encodeFilename(new_path)):
-                self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
-            else:
-                self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
-                self.run_ffmpeg(path, new_path, acodec, more_opts)
+            self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
+            self.run_ffmpeg(path, new_path, acodec, more_opts)
         except AudioConversionError as e:
             raise PostProcessingError(
                 'audio conversion failed: ' + e.msg)
@@ -286,7 +284,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
                 errnote='Cannot update utime of audio file')
 
         information['filepath'] = new_path
-        return self._nopostoverwrites, information
+        information['ext'] = extension
+
+        return [path], information
 
 
 class FFmpegVideoConvertorPP(FFmpegPostProcessor):
@@ -296,225 +296,36 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
 
     def run(self, information):
         path = information['filepath']
-        prefix, sep, ext = path.rpartition('.')
-        outpath = prefix + sep + self._preferedformat
         if information['ext'] == self._preferedformat:
             self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
-            return True, information
+            return [], information
+        options = []
+        if self._preferedformat == 'avi':
+            options.extend(['-c:v', 'libxvid', '-vtag', 'XVID'])
+        prefix, sep, ext = path.rpartition('.')
+        outpath = prefix + sep + self._preferedformat
         self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
-        self.run_ffmpeg(path, outpath, [])
+        self.run_ffmpeg(path, outpath, options)
         information['filepath'] = outpath
         information['format'] = self._preferedformat
         information['ext'] = self._preferedformat
-        return False, information
+        return [path], information
 
 
 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
-    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
-    _lang_map = {
-        'aa': 'aar',
-        'ab': 'abk',
-        'ae': 'ave',
-        'af': 'afr',
-        'ak': 'aka',
-        'am': 'amh',
-        'an': 'arg',
-        'ar': 'ara',
-        'as': 'asm',
-        'av': 'ava',
-        'ay': 'aym',
-        'az': 'aze',
-        'ba': 'bak',
-        'be': 'bel',
-        'bg': 'bul',
-        'bh': 'bih',
-        'bi': 'bis',
-        'bm': 'bam',
-        'bn': 'ben',
-        'bo': 'bod',
-        'br': 'bre',
-        'bs': 'bos',
-        'ca': 'cat',
-        'ce': 'che',
-        'ch': 'cha',
-        'co': 'cos',
-        'cr': 'cre',
-        'cs': 'ces',
-        'cu': 'chu',
-        'cv': 'chv',
-        'cy': 'cym',
-        'da': 'dan',
-        'de': 'deu',
-        'dv': 'div',
-        'dz': 'dzo',
-        'ee': 'ewe',
-        'el': 'ell',
-        'en': 'eng',
-        'eo': 'epo',
-        'es': 'spa',
-        'et': 'est',
-        'eu': 'eus',
-        'fa': 'fas',
-        'ff': 'ful',
-        'fi': 'fin',
-        'fj': 'fij',
-        'fo': 'fao',
-        'fr': 'fra',
-        'fy': 'fry',
-        'ga': 'gle',
-        'gd': 'gla',
-        'gl': 'glg',
-        'gn': 'grn',
-        'gu': 'guj',
-        'gv': 'glv',
-        'ha': 'hau',
-        'he': 'heb',
-        'hi': 'hin',
-        'ho': 'hmo',
-        'hr': 'hrv',
-        'ht': 'hat',
-        'hu': 'hun',
-        'hy': 'hye',
-        'hz': 'her',
-        'ia': 'ina',
-        'id': 'ind',
-        'ie': 'ile',
-        'ig': 'ibo',
-        'ii': 'iii',
-        'ik': 'ipk',
-        'io': 'ido',
-        'is': 'isl',
-        'it': 'ita',
-        'iu': 'iku',
-        'ja': 'jpn',
-        'jv': 'jav',
-        'ka': 'kat',
-        'kg': 'kon',
-        'ki': 'kik',
-        'kj': 'kua',
-        'kk': 'kaz',
-        'kl': 'kal',
-        'km': 'khm',
-        'kn': 'kan',
-        'ko': 'kor',
-        'kr': 'kau',
-        'ks': 'kas',
-        'ku': 'kur',
-        'kv': 'kom',
-        'kw': 'cor',
-        'ky': 'kir',
-        'la': 'lat',
-        'lb': 'ltz',
-        'lg': 'lug',
-        'li': 'lim',
-        'ln': 'lin',
-        'lo': 'lao',
-        'lt': 'lit',
-        'lu': 'lub',
-        'lv': 'lav',
-        'mg': 'mlg',
-        'mh': 'mah',
-        'mi': 'mri',
-        'mk': 'mkd',
-        'ml': 'mal',
-        'mn': 'mon',
-        'mr': 'mar',
-        'ms': 'msa',
-        'mt': 'mlt',
-        'my': 'mya',
-        'na': 'nau',
-        'nb': 'nob',
-        'nd': 'nde',
-        'ne': 'nep',
-        'ng': 'ndo',
-        'nl': 'nld',
-        'nn': 'nno',
-        'no': 'nor',
-        'nr': 'nbl',
-        'nv': 'nav',
-        'ny': 'nya',
-        'oc': 'oci',
-        'oj': 'oji',
-        'om': 'orm',
-        'or': 'ori',
-        'os': 'oss',
-        'pa': 'pan',
-        'pi': 'pli',
-        'pl': 'pol',
-        'ps': 'pus',
-        'pt': 'por',
-        'qu': 'que',
-        'rm': 'roh',
-        'rn': 'run',
-        'ro': 'ron',
-        'ru': 'rus',
-        'rw': 'kin',
-        'sa': 'san',
-        'sc': 'srd',
-        'sd': 'snd',
-        'se': 'sme',
-        'sg': 'sag',
-        'si': 'sin',
-        'sk': 'slk',
-        'sl': 'slv',
-        'sm': 'smo',
-        'sn': 'sna',
-        'so': 'som',
-        'sq': 'sqi',
-        'sr': 'srp',
-        'ss': 'ssw',
-        'st': 'sot',
-        'su': 'sun',
-        'sv': 'swe',
-        'sw': 'swa',
-        'ta': 'tam',
-        'te': 'tel',
-        'tg': 'tgk',
-        'th': 'tha',
-        'ti': 'tir',
-        'tk': 'tuk',
-        'tl': 'tgl',
-        'tn': 'tsn',
-        'to': 'ton',
-        'tr': 'tur',
-        'ts': 'tso',
-        'tt': 'tat',
-        'tw': 'twi',
-        'ty': 'tah',
-        'ug': 'uig',
-        'uk': 'ukr',
-        'ur': 'urd',
-        'uz': 'uzb',
-        've': 'ven',
-        'vi': 'vie',
-        'vo': 'vol',
-        'wa': 'wln',
-        'wo': 'wol',
-        'xh': 'xho',
-        'yi': 'yid',
-        'yo': 'yor',
-        'za': 'zha',
-        'zh': 'zho',
-        'zu': 'zul',
-    }
-
-    @classmethod
-    def _conver_lang_code(cls, code):
-        """Convert language code from ISO 639-1 to ISO 639-2/T"""
-        return cls._lang_map.get(code[:2])
-
     def run(self, information):
-        if information['ext'] != 'mp4':
-            self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files')
-            return True, information
+        if information['ext'] not in ['mp4', 'mkv']:
+            self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
+            return [], information
         subtitles = information.get('requested_subtitles')
         if not subtitles:
             self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')
-            return True, information
+            return [], information
 
         sub_langs = list(subtitles.keys())
         filename = information['filepath']
-        input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()]
+        sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()]
+        input_files = [filename] + sub_filenames
 
         opts = [
             '-map', '0',
@@ -522,11 +333,12 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
             # Don't copy the existing subtitles, we may be running the
             # postprocessor a second time
             '-map', '-0:s',
-            '-c:s', 'mov_text',
         ]
+        if information['ext'] == 'mp4':
+            opts += ['-c:s', 'mov_text']
         for (i, lang) in enumerate(sub_langs):
             opts.extend(['-map', '%d:0' % (i + 1)])
-            lang_code = self._conver_lang_code(lang)
+            lang_code = ISO639Utils.short2long(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
 
@@ -536,7 +348,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        return True, information
+        return sub_filenames, information
 
 
 class FFmpegMetadataPP(FFmpegPostProcessor):
@@ -562,7 +374,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
 
         if not metadata:
             self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
-            return True, info
+            return [], info
 
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
@@ -579,38 +391,42 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
         self.run_ffmpeg(filename, temp_filename, options)
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
-        return True, info
+        return [], info
 
 
 class FFmpegMergerPP(FFmpegPostProcessor):
     def run(self, info):
         filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
         args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0']
         self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename)
-        self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
-        return True, info
-
-
-class FFmpegAudioFixPP(FFmpegPostProcessor):
-    def run(self, info):
-        filename = info['filepath']
-        temp_filename = prepend_extension(filename, 'temp')
-
-        options = ['-vn', '-acodec', 'copy']
-        self._downloader.to_screen('[ffmpeg] Fixing audio file "%s"' % filename)
-        self.run_ffmpeg(filename, temp_filename, options)
-
-        os.remove(encodeFilename(filename))
+        self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args)
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        return info['__files_to_merge'], info
 
-        return True, info
+    def can_merge(self):
+        # TODO: figure out merge-capable ffmpeg version
+        if self.basename != 'avconv':
+            return True
+
+        required_version = '10-0'
+        if is_outdated_version(
+                self._versions[self.basename], required_version):
+            warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, '
+                       'youtube-dl will download single file media. '
+                       'Update %s to version %s or newer to fix this.') % (
+                           self.basename, self.basename, required_version)
+            if self._downloader:
+                self._downloader.report_warning(warning)
+            return False
+        return True
 
 
 class FFmpegFixupStretchedPP(FFmpegPostProcessor):
     def run(self, info):
         stretched_ratio = info.get('stretched_ratio')
         if stretched_ratio is None or stretched_ratio == 1:
-            return True, info
+            return [], info
 
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
@@ -622,13 +438,13 @@ class FFmpegFixupStretchedPP(FFmpegPostProcessor):
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        return True, info
+        return [], info
 
 
 class FFmpegFixupM4aPP(FFmpegPostProcessor):
     def run(self, info):
         if info.get('container') != 'm4a_dash':
-            return True, info
+            return [], info
 
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
@@ -640,7 +456,7 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        return True, info
+        return [], info
 
 
 class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
@@ -657,7 +473,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
             new_format = 'webvtt'
         if subs is None:
             self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert')
-            return True, info
+            return [], info
         self._downloader.to_screen('[ffmpeg] Converting subtitles')
         for lang, sub in subs.items():
             ext = sub['ext']
@@ -667,6 +483,30 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                     'format' % new_ext)
                 continue
             new_file = subtitles_filename(filename, lang, new_ext)
+
+            if ext == 'dfxp' or ext == 'ttml':
+                self._downloader.report_warning(
+                    'You have requested to convert dfxp (TTML) subtitles into another format, '
+                    'which results in style information loss')
+
+                dfxp_file = subtitles_filename(filename, lang, ext)
+                srt_file = subtitles_filename(filename, lang, 'srt')
+
+                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                    srt_data = dfxp2srt(f.read())
+
+                with io.open(srt_file, 'wt', encoding='utf-8') as f:
+                    f.write(srt_data)
+
+                ext = 'srt'
+                subs[lang] = {
+                    'ext': 'srt',
+                    'data': srt_data
+                }
+
+                if new_ext == 'srt':
+                    continue
+
             self.run_ffmpeg(
                 subtitles_filename(filename, lang, ext),
                 new_file, ['-f', new_format])
@@ -677,4 +517,4 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                     'data': f.read(),
                 }
 
-        return True, info
+        return [], info
index 5019433d3dde55a9e18f82f40ad8f93e4a49d10b..a56077f206b5133f2fae3fadbcad10523353ed5a 100644 (file)
@@ -44,4 +44,4 @@ class MetadataFromTitlePP(PostProcessor):
             info[attribute] = value
             self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value)
 
-        return True, info
+        return [], info
index f6c63fe97545d86947ef1ef4bf2d70e9ea7144be..7d88e130820e073af1b6fd527390cb1cb5dc8dec 100644 (file)
@@ -3,17 +3,34 @@ from __future__ import unicode_literals
 import os
 import subprocess
 import sys
+import errno
 
 from .common import PostProcessor
-from ..compat import (
-    subprocess_check_output
-)
 from ..utils import (
     check_executable,
     hyphenate_date,
+    version_tuple,
+    PostProcessingError,
+    encodeArgument,
+    encodeFilename,
 )
 
 
+class XAttrMetadataError(PostProcessingError):
+    def __init__(self, code=None, msg='Unknown error'):
+        super(XAttrMetadataError, self).__init__(msg)
+        self.code = code
+
+        # Parsing code and msg
+        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
+                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
+            self.reason = 'NO_SPACE'
+        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
+            self.reason = 'VALUE_TOO_LONG'
+        else:
+            self.reason = 'NOT_SUPPORTED'
+
+
 class XAttrMetadataPP(PostProcessor):
 
     #
@@ -36,8 +53,24 @@ class XAttrMetadataPP(PostProcessor):
             # try the pyxattr module...
             import xattr
 
+            # Unicode arguments are not supported in python-pyxattr until
+            # version 0.5.0
+            # See https://github.com/rg3/youtube-dl/issues/5498
+            pyxattr_required_version = '0.5.0'
+            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
+                self._downloader.report_warning(
+                    'python-pyxattr is detected but is too old. '
+                    'youtube-dl requires %s or above while your version is %s. '
+                    'Falling back to other xattr implementations' % (
+                        pyxattr_required_version, xattr.__version__))
+
+                raise ImportError
+
             def write_xattr(path, key, value):
-                return xattr.setxattr(path, key, value)
+                try:
+                    xattr.set(path, key, value)
+                except EnvironmentError as e:
+                    raise XAttrMetadataError(e.errno, e.strerror)
 
         except ImportError:
             if os.name == 'nt':
@@ -48,8 +81,11 @@ class XAttrMetadataPP(PostProcessor):
                     assert os.path.exists(path)
 
                     ads_fn = path + ":" + key
-                    with open(ads_fn, "wb") as f:
-                        f.write(value)
+                    try:
+                        with open(ads_fn, "wb") as f:
+                            f.write(value)
+                    except EnvironmentError as e:
+                        raise XAttrMetadataError(e.errno, e.strerror)
             else:
                 user_has_setfattr = check_executable("setfattr", ['--version'])
                 user_has_xattr = check_executable("xattr", ['-h'])
@@ -57,12 +93,27 @@ class XAttrMetadataPP(PostProcessor):
                 if user_has_setfattr or user_has_xattr:
 
                     def write_xattr(path, key, value):
+                        value = value.decode('utf-8')
                         if user_has_setfattr:
-                            cmd = ['setfattr', '-n', key, '-v', value, path]
+                            executable = 'setfattr'
+                            opts = ['-n', key, '-v', value]
                         elif user_has_xattr:
-                            cmd = ['xattr', '-w', key, value, path]
-
-                        subprocess_check_output(cmd)
+                            executable = 'xattr'
+                            opts = ['-w', key, value]
+
+                        cmd = ([encodeFilename(executable, True)] +
+                               [encodeArgument(o) for o in opts] +
+                               [encodeFilename(path, True)])
+
+                        try:
+                            p = subprocess.Popen(
+                                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+                        except EnvironmentError as e:
+                            raise XAttrMetadataError(e.errno, e.strerror)
+                        stdout, stderr = p.communicate()
+                        stderr = stderr.decode('utf-8', 'replace')
+                        if p.returncode != 0:
+                            raise XAttrMetadataError(p.returncode, stderr)
 
                 else:
                     # On Unix, and can't find pyxattr, setfattr, or xattr.
@@ -105,8 +156,21 @@ class XAttrMetadataPP(PostProcessor):
                     byte_value = value.encode('utf-8')
                     write_xattr(filename, xattrname, byte_value)
 
-            return True, info
+            return [], info
 
-        except (subprocess.CalledProcessError, OSError):
-            self._downloader.report_error("This filesystem doesn't support extended attributes. (You may have to enable them in your /etc/fstab)")
-            return False, info
+        except XAttrMetadataError as e:
+            if e.reason == 'NO_SPACE':
+                self._downloader.report_warning(
+                    'There\'s no disk space left or disk quota exceeded. ' +
+                    'Extended attributes are not written.')
+            elif e.reason == 'VALUE_TOO_LONG':
+                self._downloader.report_warning(
+                    'Unable to write extended attributes due to too long values.')
+            else:
+                msg = 'This filesystem doesn\'t support extended attributes. '
+                if os.name == 'nt':
+                    msg += 'You need to use NTFS.'
+                else:
+                    msg += '(You may have to enable them in your /etc/fstab)'
+                self._downloader.report_error(msg)
+            return [], info
index de3169eef1d6ec29d82a60b2f4b6a68f49d7dd4e..fc7ac8305d71c8cce077ef3040cd0903ac9f09c5 100644 (file)
@@ -50,7 +50,7 @@ def rsa_verify(message, signature, key):
 def update_self(to_screen, verbose):
     """Update the program file with the latest version from the repository"""
 
-    UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
+    UPDATE_URL = "https://rg3.github.io/youtube-dl/update/"
     VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
     JSON_URL = UPDATE_URL + 'versions.json'
     UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
index 52f0dd09aac2ef0103212086a280fc317b36b82d..942f76d2452c06a261d75e03cebc999fff02874c 100644 (file)
@@ -37,6 +37,7 @@ from .compat import (
     compat_chr,
     compat_html_entities,
     compat_http_client,
+    compat_kwargs,
     compat_parse_qs,
     compat_socket_create_connection,
     compat_str,
@@ -61,6 +62,8 @@ std_headers = {
 }
 
 
+NO_DEFAULT = object()
+
 ENGLISH_MONTH_NAMES = [
     'January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December']
@@ -114,7 +117,7 @@ def write_json_file(obj, fn):
             'encoding': 'utf-8',
         })
 
-    tf = tempfile.NamedTemporaryFile(**args)
+    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 
     try:
         with tf:
@@ -170,13 +173,15 @@ def xpath_with_ns(path, ns_map):
     return '/'.join(replaced)
 
 
-def xpath_text(node, xpath, name=None, fatal=False):
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     if sys.version_info < (2, 7):  # Crazy 2.6
         xpath = xpath.encode('ascii')
 
     n = node.find(xpath)
     if n is None or n.text is None:
-        if fatal:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
             name = xpath if name is None else name
             raise ExtractorError('Could not find XML element %s' % name)
         else:
@@ -312,27 +317,20 @@ def sanitize_path(s):
     """Sanitizes and normalizes path on Windows"""
     if sys.platform != 'win32':
         return s
-    drive, _ = os.path.splitdrive(s)
-    unc, _ = os.path.splitunc(s)
-    unc_or_drive = unc or drive
-    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
-    if unc_or_drive:
+    drive_or_unc, _ = os.path.splitdrive(s)
+    if sys.version_info < (2, 7) and not drive_or_unc:
+        drive_or_unc, _ = os.path.splitunc(s)
+    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
+    if drive_or_unc:
         norm_path.pop(0)
     sanitized_path = [
         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
         for path_part in norm_path]
-    if unc_or_drive:
-        sanitized_path.insert(0, unc_or_drive + os.path.sep)
+    if drive_or_unc:
+        sanitized_path.insert(0, drive_or_unc + os.path.sep)
     return os.path.join(*sanitized_path)
 
 
-def sanitize_url_path_consecutive_slashes(url):
-    """Collapses consecutive slashes in URLs' path"""
-    parsed_url = list(compat_urlparse.urlparse(url))
-    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
-    return compat_urlparse.urlunparse(parsed_url)
-
-
 def orderedSet(iterable):
     """ Remove all duplicates from the input iterable """
     res = []
@@ -371,6 +369,18 @@ def unescapeHTML(s):
         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
+def get_subprocess_encoding():
+    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+        # For subprocess calls, encode with locale encoding
+        # Refer to http://stackoverflow.com/a/9951851/35070
+        encoding = preferredencoding()
+    else:
+        encoding = sys.getfilesystemencoding()
+    if encoding is None:
+        encoding = 'utf-8'
+    return encoding
+
+
 def encodeFilename(s, for_subprocess=False):
     """
     @param s The name of the file
@@ -382,21 +392,24 @@ def encodeFilename(s, for_subprocess=False):
     if sys.version_info >= (3, 0):
         return s
 
-    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
-        # Pass '' directly to use Unicode APIs on Windows 2000 and up
-        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
-        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
-        if not for_subprocess:
-            return s
-        else:
-            # For subprocess calls, encode with locale encoding
-            # Refer to http://stackoverflow.com/a/9951851/35070
-            encoding = preferredencoding()
-    else:
-        encoding = sys.getfilesystemencoding()
-    if encoding is None:
-        encoding = 'utf-8'
-    return s.encode(encoding, 'ignore')
+    # Pass '' directly to use Unicode APIs on Windows 2000 and up
+    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+        return s
+
+    return s.encode(get_subprocess_encoding(), 'ignore')
+
+
+def decodeFilename(b, for_subprocess=False):
+
+    if sys.version_info >= (3, 0):
+        return b
+
+    if not isinstance(b, bytes):
+        return b
+
+    return b.decode(get_subprocess_encoding(), 'ignore')
 
 
 def encodeArgument(s):
@@ -408,6 +421,10 @@ def encodeArgument(s):
     return encodeFilename(s, True)
 
 
+def decodeArgument(b):
+    return decodeFilename(b, True)
+
+
 def decodeOption(optval):
     if optval is None:
         return optval
@@ -452,6 +469,17 @@ def make_HTTPS_handler(params, **kwargs):
         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 
 
+def bug_reports_message():
+    if ytdl_is_updateable():
+        update_cmd = 'type  youtube-dl -U  to update'
+    else:
+        update_cmd = 'see  https://yt-dl.org/update  on how to update'
+    msg = '; please report this issue on https://yt-dl.org/bug .'
+    msg += ' Make sure you are using the latest version; %s.' % update_cmd
+    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
+    return msg
+
+
 class ExtractorError(Exception):
     """Error during info extraction."""
 
@@ -467,13 +495,7 @@ class ExtractorError(Exception):
         if cause:
             msg += ' (caused by %r)' % cause
         if not expected:
-            if ytdl_is_updateable():
-                update_cmd = 'type  youtube-dl -U  to update'
-            else:
-                update_cmd = 'see  https://yt-dl.org/update  on how to update'
-            msg += '; please report this issue on https://yt-dl.org/bug .'
-            msg += ' Make sure you are using the latest version; %s.' % update_cmd
-            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
+            msg += bug_reports_message()
         super(ExtractorError, self).__init__(msg)
 
         self.traceback = tb
@@ -1104,15 +1126,6 @@ def shell_quote(args):
     return ' '.join(quoted_args)
 
 
-def takewhile_inclusive(pred, seq):
-    """ Like itertools.takewhile, but include the latest evaluated element
-        (the first element so that Not pred(e)) """
-    for e in seq:
-        yield e
-        if not pred(e):
-            return
-
-
 def smuggle_url(url, data):
     """ Pass additional data in a URL for internal use. """
 
@@ -1333,9 +1346,19 @@ def parse_duration(s):
     return res
 
 
-def prepend_extension(filename, ext):
+def prepend_extension(filename, ext, expected_real_ext=None):
+    name, real_ext = os.path.splitext(filename)
+    return (
+        '{0}.{1}{2}'.format(name, ext, real_ext)
+        if not expected_real_ext or real_ext[1:] == expected_real_ext
+        else '{0}.{1}'.format(filename, ext))
+
+
+def replace_extension(filename, ext, expected_real_ext=None):
     name, real_ext = os.path.splitext(filename)
-    return '{0}.{1}{2}'.format(name, ext, real_ext)
+    return '{0}.{1}'.format(
+        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
+        ext)
 
 
 def check_executable(exe, args=[]):
@@ -1354,7 +1377,7 @@ def get_exe_version(exe, args=['--version'],
     or False if the executable is not present """
     try:
         out, _ = subprocess.Popen(
-            [exe] + args,
+            [encodeArgument(exe)] + args,
             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
     except OSError:
         return False
@@ -1460,6 +1483,14 @@ def uppercase_escape(s):
         s)
 
 
+def lowercase_escape(s):
+    unicode_escape = codecs.getdecoder('unicode_escape')
+    return re.sub(
+        r'\\u[0-9a-fA-F]{4}',
+        lambda m: unicode_escape(m.group(0))[0],
+        s)
+
+
 def escape_rfc3986(s):
     """Escape non-ASCII characters as suggested by RFC 3986"""
     if sys.version_info < (3, 0) and isinstance(s, compat_str):
@@ -1638,6 +1669,7 @@ def mimetype2ext(mt):
     return {
         'x-ms-wmv': 'wmv',
         'x-mp4-fragmented': 'mp4',
+        'ttml+xml': 'ttml',
     }.get(res, res)
 
 
@@ -1795,6 +1827,527 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def parse_dfxp_time_expr(time_expr):
+    if not time_expr:
+        return 0.0
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+
+
+def srt_subtitles_timecode(seconds):
+    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
+
+
+def dfxp2srt(dfxp_data):
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+    })
+
+    def parse_node(node):
+        str_or_empty = functools.partial(str_or_none, default='')
+
+        out = str_or_empty(node.text)
+
+        for child in node:
+            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                out += '\n' + str_or_empty(child.tail)
+            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
+                out += str_or_empty(parse_node(child))
+            else:
+                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+
+        return out
+
+    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    out = []
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+
+    if not paras:
+        raise ValueError('Invalid dfxp/TTML subtitle')
+
+    for para, index in zip(paras, itertools.count(1)):
+        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        if not end_time:
+            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            index,
+            srt_subtitles_timecode(begin_time),
+            srt_subtitles_timecode(end_time),
+            parse_node(para)))
+
+    return ''.join(out)
+
+
+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
+
+class ISO3166Utils(object):
+    # From http://data.okfn.org/data/core/country-list
+    _country_map = {
+        'AF': 'Afghanistan',
+        'AX': 'Åland Islands',
+        'AL': 'Albania',
+        'DZ': 'Algeria',
+        'AS': 'American Samoa',
+        'AD': 'Andorra',
+        'AO': 'Angola',
+        'AI': 'Anguilla',
+        'AQ': 'Antarctica',
+        'AG': 'Antigua and Barbuda',
+        'AR': 'Argentina',
+        'AM': 'Armenia',
+        'AW': 'Aruba',
+        'AU': 'Australia',
+        'AT': 'Austria',
+        'AZ': 'Azerbaijan',
+        'BS': 'Bahamas',
+        'BH': 'Bahrain',
+        'BD': 'Bangladesh',
+        'BB': 'Barbados',
+        'BY': 'Belarus',
+        'BE': 'Belgium',
+        'BZ': 'Belize',
+        'BJ': 'Benin',
+        'BM': 'Bermuda',
+        'BT': 'Bhutan',
+        'BO': 'Bolivia, Plurinational State of',
+        'BQ': 'Bonaire, Sint Eustatius and Saba',
+        'BA': 'Bosnia and Herzegovina',
+        'BW': 'Botswana',
+        'BV': 'Bouvet Island',
+        'BR': 'Brazil',
+        'IO': 'British Indian Ocean Territory',
+        'BN': 'Brunei Darussalam',
+        'BG': 'Bulgaria',
+        'BF': 'Burkina Faso',
+        'BI': 'Burundi',
+        'KH': 'Cambodia',
+        'CM': 'Cameroon',
+        'CA': 'Canada',
+        'CV': 'Cape Verde',
+        'KY': 'Cayman Islands',
+        'CF': 'Central African Republic',
+        'TD': 'Chad',
+        'CL': 'Chile',
+        'CN': 'China',
+        'CX': 'Christmas Island',
+        'CC': 'Cocos (Keeling) Islands',
+        'CO': 'Colombia',
+        'KM': 'Comoros',
+        'CG': 'Congo',
+        'CD': 'Congo, the Democratic Republic of the',
+        'CK': 'Cook Islands',
+        'CR': 'Costa Rica',
+        'CI': 'Côte d\'Ivoire',
+        'HR': 'Croatia',
+        'CU': 'Cuba',
+        'CW': 'Curaçao',
+        'CY': 'Cyprus',
+        'CZ': 'Czech Republic',
+        'DK': 'Denmark',
+        'DJ': 'Djibouti',
+        'DM': 'Dominica',
+        'DO': 'Dominican Republic',
+        'EC': 'Ecuador',
+        'EG': 'Egypt',
+        'SV': 'El Salvador',
+        'GQ': 'Equatorial Guinea',
+        'ER': 'Eritrea',
+        'EE': 'Estonia',
+        'ET': 'Ethiopia',
+        'FK': 'Falkland Islands (Malvinas)',
+        'FO': 'Faroe Islands',
+        'FJ': 'Fiji',
+        'FI': 'Finland',
+        'FR': 'France',
+        'GF': 'French Guiana',
+        'PF': 'French Polynesia',
+        'TF': 'French Southern Territories',
+        'GA': 'Gabon',
+        'GM': 'Gambia',
+        'GE': 'Georgia',
+        'DE': 'Germany',
+        'GH': 'Ghana',
+        'GI': 'Gibraltar',
+        'GR': 'Greece',
+        'GL': 'Greenland',
+        'GD': 'Grenada',
+        'GP': 'Guadeloupe',
+        'GU': 'Guam',
+        'GT': 'Guatemala',
+        'GG': 'Guernsey',
+        'GN': 'Guinea',
+        'GW': 'Guinea-Bissau',
+        'GY': 'Guyana',
+        'HT': 'Haiti',
+        'HM': 'Heard Island and McDonald Islands',
+        'VA': 'Holy See (Vatican City State)',
+        'HN': 'Honduras',
+        'HK': 'Hong Kong',
+        'HU': 'Hungary',
+        'IS': 'Iceland',
+        'IN': 'India',
+        'ID': 'Indonesia',
+        'IR': 'Iran, Islamic Republic of',
+        'IQ': 'Iraq',
+        'IE': 'Ireland',
+        'IM': 'Isle of Man',
+        'IL': 'Israel',
+        'IT': 'Italy',
+        'JM': 'Jamaica',
+        'JP': 'Japan',
+        'JE': 'Jersey',
+        'JO': 'Jordan',
+        'KZ': 'Kazakhstan',
+        'KE': 'Kenya',
+        'KI': 'Kiribati',
+        'KP': 'Korea, Democratic People\'s Republic of',
+        'KR': 'Korea, Republic of',
+        'KW': 'Kuwait',
+        'KG': 'Kyrgyzstan',
+        'LA': 'Lao People\'s Democratic Republic',
+        'LV': 'Latvia',
+        'LB': 'Lebanon',
+        'LS': 'Lesotho',
+        'LR': 'Liberia',
+        'LY': 'Libya',
+        'LI': 'Liechtenstein',
+        'LT': 'Lithuania',
+        'LU': 'Luxembourg',
+        'MO': 'Macao',
+        'MK': 'Macedonia, the Former Yugoslav Republic of',
+        'MG': 'Madagascar',
+        'MW': 'Malawi',
+        'MY': 'Malaysia',
+        'MV': 'Maldives',
+        'ML': 'Mali',
+        'MT': 'Malta',
+        'MH': 'Marshall Islands',
+        'MQ': 'Martinique',
+        'MR': 'Mauritania',
+        'MU': 'Mauritius',
+        'YT': 'Mayotte',
+        'MX': 'Mexico',
+        'FM': 'Micronesia, Federated States of',
+        'MD': 'Moldova, Republic of',
+        'MC': 'Monaco',
+        'MN': 'Mongolia',
+        'ME': 'Montenegro',
+        'MS': 'Montserrat',
+        'MA': 'Morocco',
+        'MZ': 'Mozambique',
+        'MM': 'Myanmar',
+        'NA': 'Namibia',
+        'NR': 'Nauru',
+        'NP': 'Nepal',
+        'NL': 'Netherlands',
+        'NC': 'New Caledonia',
+        'NZ': 'New Zealand',
+        'NI': 'Nicaragua',
+        'NE': 'Niger',
+        'NG': 'Nigeria',
+        'NU': 'Niue',
+        'NF': 'Norfolk Island',
+        'MP': 'Northern Mariana Islands',
+        'NO': 'Norway',
+        'OM': 'Oman',
+        'PK': 'Pakistan',
+        'PW': 'Palau',
+        'PS': 'Palestine, State of',
+        'PA': 'Panama',
+        'PG': 'Papua New Guinea',
+        'PY': 'Paraguay',
+        'PE': 'Peru',
+        'PH': 'Philippines',
+        'PN': 'Pitcairn',
+        'PL': 'Poland',
+        'PT': 'Portugal',
+        'PR': 'Puerto Rico',
+        'QA': 'Qatar',
+        'RE': 'Réunion',
+        'RO': 'Romania',
+        'RU': 'Russian Federation',
+        'RW': 'Rwanda',
+        'BL': 'Saint Barthélemy',
+        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
+        'KN': 'Saint Kitts and Nevis',
+        'LC': 'Saint Lucia',
+        'MF': 'Saint Martin (French part)',
+        'PM': 'Saint Pierre and Miquelon',
+        'VC': 'Saint Vincent and the Grenadines',
+        'WS': 'Samoa',
+        'SM': 'San Marino',
+        'ST': 'Sao Tome and Principe',
+        'SA': 'Saudi Arabia',
+        'SN': 'Senegal',
+        'RS': 'Serbia',
+        'SC': 'Seychelles',
+        'SL': 'Sierra Leone',
+        'SG': 'Singapore',
+        'SX': 'Sint Maarten (Dutch part)',
+        'SK': 'Slovakia',
+        'SI': 'Slovenia',
+        'SB': 'Solomon Islands',
+        'SO': 'Somalia',
+        'ZA': 'South Africa',
+        'GS': 'South Georgia and the South Sandwich Islands',
+        'SS': 'South Sudan',
+        'ES': 'Spain',
+        'LK': 'Sri Lanka',
+        'SD': 'Sudan',
+        'SR': 'Suriname',
+        'SJ': 'Svalbard and Jan Mayen',
+        'SZ': 'Swaziland',
+        'SE': 'Sweden',
+        'CH': 'Switzerland',
+        'SY': 'Syrian Arab Republic',
+        'TW': 'Taiwan, Province of China',
+        'TJ': 'Tajikistan',
+        'TZ': 'Tanzania, United Republic of',
+        'TH': 'Thailand',
+        'TL': 'Timor-Leste',
+        'TG': 'Togo',
+        'TK': 'Tokelau',
+        'TO': 'Tonga',
+        'TT': 'Trinidad and Tobago',
+        'TN': 'Tunisia',
+        'TR': 'Turkey',
+        'TM': 'Turkmenistan',
+        'TC': 'Turks and Caicos Islands',
+        'TV': 'Tuvalu',
+        'UG': 'Uganda',
+        'UA': 'Ukraine',
+        'AE': 'United Arab Emirates',
+        'GB': 'United Kingdom',
+        'US': 'United States',
+        'UM': 'United States Minor Outlying Islands',
+        'UY': 'Uruguay',
+        'UZ': 'Uzbekistan',
+        'VU': 'Vanuatu',
+        'VE': 'Venezuela, Bolivarian Republic of',
+        'VN': 'Viet Nam',
+        'VG': 'Virgin Islands, British',
+        'VI': 'Virgin Islands, U.S.',
+        'WF': 'Wallis and Futuna',
+        'EH': 'Western Sahara',
+        'YE': 'Yemen',
+        'ZM': 'Zambia',
+        'ZW': 'Zimbabwe',
+    }
+
+    @classmethod
+    def short2full(cls, code):
+        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
+        return cls._country_map.get(code.upper())
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
index 1095fea2fe908fe59d28b3489b7916d05b8e6354..280afdd7f8f21cc058768eba232a11672ca32761 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.04.09'
+__version__ = '2015.07.21'