Merge branch 'ir90tv' of https://github.com/cyb3r/youtube-dl into cyb3r-ir90tv
authorYen Chi Hsuan <yan12125@gmail.com>
Sat, 25 Jul 2015 10:36:04 +0000 (18:36 +0800)
committerYen Chi Hsuan <yan12125@gmail.com>
Sat, 25 Jul 2015 10:36:04 +0000 (18:36 +0800)
232 files changed:
AUTHORS
README.md
docs/supportedsites.md
test/test_YoutubeDL.py
test/test_aes.py
test/test_compat.py
test/test_subtitles.py
test/test_utils.py
tox.ini
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/aes.py
youtube_dl/compat.py
youtube_dl/downloader/__init__.py
youtube_dl/downloader/dash.py [new file with mode: 0644]
youtube_dl/downloader/external.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/adobetv.py
youtube_dl/extractor/aftenposten.py
youtube_dl/extractor/aftonbladet.py
youtube_dl/extractor/appleconnect.py [new file with mode: 0644]
youtube_dl/extractor/ard.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/baidu.py
youtube_dl/extractor/bbccouk.py
youtube_dl/extractor/bet.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/bliptv.py
youtube_dl/extractor/br.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/byutv.py
youtube_dl/extractor/canalplus.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/ccc.py
youtube_dl/extractor/ceskatelevize.py
youtube_dl/extractor/chilloutzone.py
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/clipsyndicate.py
youtube_dl/extractor/cnet.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/common.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/ctsnews.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/dfb.py
youtube_dl/extractor/discovery.py
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dramafever.py [new file with mode: 0644]
youtube_dl/extractor/drbonanza.py
youtube_dl/extractor/drtuber.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/dumpert.py
youtube_dl/extractor/ehow.py
youtube_dl/extractor/empflix.py [deleted file]
youtube_dl/extractor/eroprofile.py
youtube_dl/extractor/espn.py [new file with mode: 0644]
youtube_dl/extractor/facebook.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/firedrive.py [deleted file]
youtube_dl/extractor/fivetv.py [new file with mode: 0644]
youtube_dl/extractor/foxsports.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/gfycat.py
youtube_dl/extractor/gorillavid.py
youtube_dl/extractor/hentaistigma.py
youtube_dl/extractor/hostingbulk.py
youtube_dl/extractor/howcast.py
youtube_dl/extractor/howstuffworks.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/imgur.py
youtube_dl/extractor/ina.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iprima.py
youtube_dl/extractor/iqiyi.py [new file with mode: 0644]
youtube_dl/extractor/izlesene.py
youtube_dl/extractor/jeuxvideo.py
youtube_dl/extractor/kanalplay.py
youtube_dl/extractor/karaoketv.py
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/kickstarter.py
youtube_dl/extractor/kuwo.py [new file with mode: 0644]
youtube_dl/extractor/lecture2go.py [new file with mode: 0644]
youtube_dl/extractor/letv.py
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/lynda.py
youtube_dl/extractor/malemotion.py
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/mitele.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mlb.py
youtube_dl/extractor/mofosex.py
youtube_dl/extractor/myspass.py
youtube_dl/extractor/myvi.py [new file with mode: 0644]
youtube_dl/extractor/myvideo.py
youtube_dl/extractor/nationalgeographic.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/neteasemusic.py [new file with mode: 0644]
youtube_dl/extractor/netzkino.py
youtube_dl/extractor/newstube.py
youtube_dl/extractor/nextmedia.py
youtube_dl/extractor/nfl.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/noco.py
youtube_dl/extractor/nova.py [new file with mode: 0644]
youtube_dl/extractor/nowtv.py [new file with mode: 0644]
youtube_dl/extractor/npo.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/nytimes.py
youtube_dl/extractor/odnoklassniki.py
youtube_dl/extractor/onionstudios.py [new file with mode: 0644]
youtube_dl/extractor/ooyala.py
youtube_dl/extractor/openfilm.py
youtube_dl/extractor/patreon.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/photobucket.py
youtube_dl/extractor/pinkbike.py [new file with mode: 0644]
youtube_dl/extractor/planetaplay.py
youtube_dl/extractor/played.py
youtube_dl/extractor/playvid.py
youtube_dl/extractor/porn91.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/pornovoisines.py
youtube_dl/extractor/primesharetv.py
youtube_dl/extractor/promptfile.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/qqmusic.py
youtube_dl/extractor/quickvid.py
youtube_dl/extractor/rds.py [new file with mode: 0644]
youtube_dl/extractor/rtbf.py
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rtlnow.py [deleted file]
youtube_dl/extractor/rts.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/ruutu.py [new file with mode: 0644]
youtube_dl/extractor/safari.py
youtube_dl/extractor/sbs.py
youtube_dl/extractor/screenwavemedia.py
youtube_dl/extractor/senateisvp.py
youtube_dl/extractor/shared.py
youtube_dl/extractor/smotri.py
youtube_dl/extractor/snagfilms.py [new file with mode: 0644]
youtube_dl/extractor/sockshare.py [deleted file]
youtube_dl/extractor/sohu.py
youtube_dl/extractor/soompi.py [new file with mode: 0644]
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/southpark.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/spiegeltv.py
youtube_dl/extractor/sportbox.py
youtube_dl/extractor/sunporno.py
youtube_dl/extractor/svt.py [new file with mode: 0644]
youtube_dl/extractor/svtplay.py [deleted file]
youtube_dl/extractor/tagesschau.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/tenplay.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/thesixtyone.py
youtube_dl/extractor/thisamericanlife.py [new file with mode: 0644]
youtube_dl/extractor/tlc.py
youtube_dl/extractor/tmz.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tubitv.py [new file with mode: 0644]
youtube_dl/extractor/tumblr.py
youtube_dl/extractor/turbo.py
youtube_dl/extractor/tutv.py
youtube_dl/extractor/tv2.py [new file with mode: 0644]
youtube_dl/extractor/tvc.py [new file with mode: 0644]
youtube_dl/extractor/tvigle.py
youtube_dl/extractor/tvplay.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/twitter.py [new file with mode: 0644]
youtube_dl/extractor/udemy.py
youtube_dl/extractor/udn.py
youtube_dl/extractor/ultimedia.py
youtube_dl/extractor/vbox7.py
youtube_dl/extractor/veehd.py
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/videomega.py
youtube_dl/extractor/videott.py
youtube_dl/extractor/vidme.py
youtube_dl/extractor/vier.py
youtube_dl/extractor/viewster.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vimple.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vodlocker.py
youtube_dl/extractor/voicerepublic.py [new file with mode: 0644]
youtube_dl/extractor/vube.py
youtube_dl/extractor/vuclip.py
youtube_dl/extractor/vulture.py
youtube_dl/extractor/webofstories.py
youtube_dl/extractor/wimp.py
youtube_dl/extractor/worldstarhiphop.py
youtube_dl/extractor/xbef.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xminus.py
youtube_dl/extractor/xnxx.py
youtube_dl/extractor/xstream.py [new file with mode: 0644]
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/yam.py
youtube_dl/extractor/yinyuetai.py [new file with mode: 0644]
youtube_dl/extractor/ynet.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zingmp3.py
youtube_dl/options.py
youtube_dl/postprocessor/common.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/postprocessor/execafterdownload.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/postprocessor/xattrpp.py
youtube_dl/update.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 267b8da1e6ffbda7853d53996de4020111074834..e75e9885d83d960e5013e0bdf6047016dc1b3239 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -124,3 +124,14 @@ Mohammad Teimori Pabandi
 Roman Le Négrate
 Matthias Küch
 Julian Richen
+Ping O.
+Mister Hat
+Peter Ding
+jackyzy823
+George Brighton
+Remita Amine
+Aurélio A. Heckert
+Bernhard Minks
+sceext
+Zach Bruggeman
+Tjark Saul
index 9aeb114f399895483e59b211e390b0d109383715..ac54d7b67b8c36d370495759153678f711ac614e 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
     sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
 
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
 
@@ -52,8 +52,9 @@ which means you can modify it, redistribute it or use it however you like.
     -i, --ignore-errors              Continue on download errors, for example to skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the playlist or the command line) if an error occurs
     --dump-user-agent                Display the current browser identification
-    --list-extractors                List all supported extractors and the URLs they would handle
+    --list-extractors                List all supported extractors
     --extractor-descriptions         Output descriptions of all supported extractors
+    --force-generic-extractor        Force extraction to use the generic extractor
     --default-search PREFIX          Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple".
                                      Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The
                                      default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.
@@ -74,7 +75,7 @@ which means you can modify it, redistribute it or use it however you like.
 ## Video Selection:
     --playlist-start NUMBER          Playlist video to start at (default is 1)
     --playlist-end NUMBER            Playlist video to end at (default is last)
-    --playlist-items ITEM_SPEC       Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8"
+    --playlist-items ITEM_SPEC       Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8"
                                      if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will
                                      download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.
     --match-title REGEX              Download only matching titles (regex or caseless sub-string)
@@ -107,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like.
     --playlist-reverse               Download playlist videos in reverse order
     --xattr-set-filesize             Set file xattribute ytdl.filesize with expected filesize (experimental)
     --hls-prefer-native              Use the native HLS downloader instead of ffmpeg (experimental)
-    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,wget
+    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,httpie,wget
     --external-downloader-args ARGS  Give these arguments to the external downloader
 
 ## Filesystem Options:
@@ -168,7 +169,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-progress                    Do not print progress bar
     --console-title                  Display progress in console titlebar
     -v, --verbose                    Print various debugging information
-    --dump-pages                     Print downloaded pages to debug problems (very verbose)
+    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)
     --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems
     --print-traffic                  Display sent and read HTTP traffic
     -C, --call-home                  Contact the youtube-dl server for debugging
@@ -189,8 +190,8 @@ which means you can modify it, redistribute it or use it however you like.
     --all-formats                    Download all available video formats
     --prefer-free-formats            Prefer free video formats unless a specific one is requested
     -F, --list-formats               List all available formats
-    --youtube-skip-dash-manifest     Do not download the DASH manifest on YouTube videos
-    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no
+    --youtube-skip-dash-manifest     Do not download the DASH manifests and related data on YouTube videos
+    --merge-output-format FORMAT     If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no
                                      merge is required
 
 ## Subtitle Options:
@@ -213,17 +214,18 @@ which means you can modify it, redistribute it or use it however you like.
     --audio-format FORMAT            Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default
     --audio-quality QUALITY          Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default
                                      5)
-    --recode-video FORMAT            Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)
+    --recode-video FORMAT            Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)
+    --postprocessor-args ARGS        Give these arguments to the postprocessor
     -k, --keep-video                 Keep the video file on disk after the post-processing; the video is erased by default
     --no-post-overwrites             Do not overwrite post-processed files; the post-processed files are overwritten by default
     --embed-subs                     Embed subtitles in the video (only for mkv and mp4 videos)
     --embed-thumbnail                Embed thumbnail in the audio as cover art
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed
-                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s -
+                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
-    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
+    --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default;
                                      fix file if we can, warn otherwise)
     --prefer-avconv                  Prefer avconv over ffmpeg for running the postprocessors (default)
     --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the postprocessors
@@ -236,6 +238,26 @@ which means you can modify it, redistribute it or use it however you like.
 
 You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`.
 
+### Authentication with `.netrc` file ###
+
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in shell command history. You can achieve this using [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create `.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
+```
+touch $HOME/.netrc
+chmod a-rwx,u+rw $HOME/.netrc
+```
+After that you can add credentials for extractor in the following format, where *extractor* is the name of extractor in lowercase:
+```
+machine <extractor> login <login> password <password>
+```
+For example:
+```
+machine youtube login myaccount@gmail.com password my_youtube_password
+machine twitch login my_twitch_account_name password my_twitch_password
+```
+To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or to place it in [configuration file](#configuration).
+
+On Windows you may also need to setup `%HOME%` environment variable manually.
+
 # OUTPUT TEMPLATE
 
 The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. Allowed names are:
@@ -269,7 +291,7 @@ The simplest case is requesting a specific format, for example `-f 22`. You can
 
 If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`).  This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so  `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f <video-format>+<audio-format>` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`.
 
-Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file.
+Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed.
 
 If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl.
 
@@ -379,7 +401,7 @@ In February 2015, the new YouTube player contained a character sequence in a str
 
 ### HTTP Error 429: Too Many Requests or 402: Payment Required
 
-These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--network-address` options](#network-options) to select another IP address.
+These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address.
 
 ### SyntaxError: Non-ASCII character ###
 
index 6d2e496a893d6e6bda5257dcffa6273b9ee84084..73445137f3f165de4d98b6f1c367553f5ce9345a 100644 (file)
  - **56.com**
  - **5min**
  - **8tracks**
+ - **91porn**
  - **9gag**
  - **abc.net.au**
  - **Abc7News**
  - **AcademicEarth:Course**
  - **AddAnime**
  - **AdobeTV**
+ - **AdobeTVVideo**
  - **AdultSwim**
  - **Aftenposten**
  - **Aftonbladet**
@@ -26,8 +28,8 @@
  - **anitube.se**
  - **AnySex**
  - **Aparat**
- - **AppleDailyAnimationNews**
- - **AppleDailyRealtimeNews**
+ - **AppleConnect**
+ - **AppleDaily**: 臺灣蘋果日報
  - **AppleTrailers**
  - **archive.org**: archive.org videos
  - **ARD**
@@ -44,7 +46,7 @@
  - **audiomack**
  - **audiomack:album**
  - **Azubu**
- - **BaiduVideo**
+ - **BaiduVideo**: 百度视频
  - **bambuser**
  - **bambuser:channel**
  - **Bandcamp**
@@ -64,6 +66,8 @@
  - **BR**: Bayerischer Rundfunk Mediathek
  - **Break**
  - **Brightcove**
+ - **bt:article**: Bergens Tidende Articles
+ - **bt:vestlendingen**: Bergens Tidende - Vestlendingen
  - **BuzzFeed**
  - **BYUtv**
  - **Camdemy**
  - **Crunchyroll**
  - **crunchyroll:playlist**
  - **CSpan**: C-SPAN
- - **CtsNews**
+ - **CtsNews**: 華視新聞
  - **culturebox.francetvinfo.fr**
  - **dailymotion**
  - **dailymotion:playlist**
  - **dailymotion:user**
+ - **DailymotionCloud**
  - **daum.net**
  - **DBTV**
  - **DctpTv**
  - **Discovery**
  - **divxstage**: DivxStage
  - **Dotsub**
- - **DouyuTV**
+ - **DouyuTV**: 斗鱼
+ - **dramafever**
+ - **dramafever:series**
  - **DRBonanza**
  - **Dropbox**
  - **DrTuber**
  - **Eporner**
  - **EroProfile**
  - **Escapist**
+ - **ESPN** (Currently broken)
  - **EveryonesMixtape**
  - **exfm**: ex.fm
  - **ExpoTV**
  - **fc2**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
- - **Firedrive**
  - **Firstpost**
+ - **FiveTV**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
  - **instagram:user**: Instagram user profile
  - **InternetVideoArchive**
  - **IPrima**
+ - **iqiyi**: 爱奇艺
  - **ivi**: ivi.ru
  - **ivi:compilation**: ivi.ru compilations
  - **Izlesene**
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
+ - **KarriereVideos**
  - **keek**
  - **KeezMovies**
  - **KhanAcademy**
  - **kontrtube**: KontrTube.ru - Труба зовёт
  - **KrasView**: Красвью
  - **Ku6**
+ - **kuwo:album**: 酷我音乐 - 专辑
+ - **kuwo:category**: 酷我音乐 - 分类
+ - **kuwo:chart**: 酷我音乐 - 排行榜
+ - **kuwo:mv**: 酷我音乐 - MV
+ - **kuwo:singer**: 酷我音乐 - 歌手
+ - **kuwo:song**: 酷我音乐
  - **la7.tv**
  - **Laola1Tv**
- - **Letv**
+ - **Letv**: 乐视网
  - **LetvPlaylist**
  - **LetvTv**
  - **Libsyn**
+ - **life:embed**
  - **lifenews**: LIFE | NEWS
  - **LiveLeak**
  - **livestream**
  - **Motherless**
  - **Motorsport**: motorsport.com
  - **MovieClips**
+ - **MovieFap**
  - **Moviezine**
  - **movshare**: MovShare
  - **MPORA**
  - **MySpace**
  - **MySpace:album**
  - **MySpass**
+ - **Myvi**
  - **myvideo**
  - **MyVidster**
+ - **N-JOY**
  - **n-tv.de**
  - **NationalGeographic**
  - **Naver**
  - **NDTV**
  - **NerdCubedFeed**
  - **Nerdist**
+ - **netease:album**: 网易云音乐 - 专辑
+ - **netease:djradio**: 网易云音乐 - 电台
+ - **netease:mv**: 网易云音乐 - MV
+ - **netease:playlist**: 网易云音乐 - 歌单
+ - **netease:program**: 网易云音乐 - 电台节目
+ - **netease:singer**: 网易云音乐 - 歌手
+ - **netease:song**: 网易云音乐
  - **Netzkino**
  - **Newgrounds**
  - **Newstube**
- - **NextMedia**
- - **NextMediaActionNews**
+ - **NextMedia**: 蘋果日報
+ - **NextMediaActionNews**: 蘋果日報 - 動新聞
  - **nfb**: National Film Board of Canada
  - **nfl.com**
  - **nhl.com**
  - **Noco**
  - **Normalboots**
  - **NosVideo**
+ - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
  - **novamov**: NovaMov
  - **Nowness**
+ - **NowTV**
  - **nowvideo**: NowVideo
- - **npo.nl**
+ - **npo**: npo.nl and ntr.nl
+ - **npo**: npo.nl and ntr.nl
  - **npo.nl:live**
  - **npo.nl:radio**
  - **npo.nl:radio:fragment**
  - **NRK**
  - **NRKPlaylist**
- - **NRKTV**
+ - **NRKTV**: NRK TV and NRK Radio
  - **ntv.ru**
  - **Nuvid**
  - **NYTimes**
+ - **NYTimesArticle**
  - **ocw.mit.edu**
  - **Odnoklassniki**
  - **OktoberfestTV**
  - **on.aol.com**
+ - **OnionStudios**
  - **Ooyala**
+ - **OoyalaExternal**
  - **OpenFilm**
  - **orf:fm4**: radio FM4
  - **orf:iptv**: iptv.ORF.at
  - **PhilharmonieDeParis**: Philharmonie de Paris
  - **Phoenix**
  - **Photobucket**
+ - **Pinkbike**
  - **Pladform**
  - **PlanetaPlay**
  - **play.fm**
  - **prosiebensat1**: ProSiebenSat.1 Digital
  - **Puls4**
  - **Pyvideo**
- - **QQMusic**
- - **QQMusicAlbum**
- - **QQMusicSinger**
+ - **qqmusic**: QQ音乐
+ - **qqmusic:album**: QQ音乐 - 专辑
+ - **qqmusic:playlist**: QQ音乐 - 歌单
+ - **qqmusic:singer**: QQ音乐 - 歌手
+ - **qqmusic:toplist**: QQ音乐 - 排行榜
  - **QuickVid**
  - **R7**
  - **radio.de**
  - **RadioJavan**
  - **Rai**
  - **RBMARadio**
+ - **RDS**: RDS.ca
  - **RedTube**
  - **Restudy**
  - **ReverbNation**
  - **Rte**
  - **rtl.nl**: rtl.nl and rtlxl.nl
  - **RTL2**
- - **RTLnow**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **Ruutu**
  - **safari**: safaribooksonline.com online video
  - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
  - **smotri:broadcast**: Smotri.com broadcasts
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
+ - **SnagFilms**
+ - **SnagFilmsEmbed**
  - **Snotr**
- - **Sockshare**
  - **Sohu**
+ - **soompi**
+ - **soompi:show**
  - **soundcloud**
  - **soundcloud:playlist**
  - **soundcloud:set**
  - **southpark.cc.com**
  - **southpark.cc.com:español**
  - **southpark.de**
+ - **southpark.nl**
+ - **southparkstudios.dk**
  - **Space**
  - **SpankBang**
  - **Spankwire**
  - **Spike**
  - **Sport5**
  - **SportBox**
+ - **SportBoxEmbed**
  - **SportDeutschland**
+ - **Sportschau**
  - **Srf**
  - **SRMediathek**: Saarländischer Rundfunk
  - **SSA**
  - **StreamCZ**
  - **StreetVoice**
  - **SunPorno**
+ - **SVT**
  - **SVTPlay**: SVT Play and Öppet arkiv
  - **SWRMediathek**
  - **Syfy**
  - **TechTalks**
  - **techtv.mit.edu**
  - **ted**
- - **tegenlicht.vpro.nl**
  - **TeleBruxelles**
  - **telecinco.es**
  - **TeleMB**
  - **TheOnion**
  - **ThePlatform**
  - **TheSixtyOne**
+ - **ThisAmericanLife**
  - **ThisAV**
  - **THVideo**
  - **THVideoPlaylist**
  - **tlc.com**
  - **tlc.de**
  - **TMZ**
+ - **TMZArticle**
  - **TNAFlix**
  - **tou.tv**
  - **Toypics**: Toypics user profile
  - **Trilulilu**
  - **TruTube**
  - **Tube8**
+ - **TubiTv**
  - **Tudou**
  - **Tumblr**
  - **TuneIn**
  - **Turbo**
  - **Tutv**
  - **tv.dfb.de**
+ - **TV2**
+ - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
+ - **TVC**
+ - **TVCArticle**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
  - **tvp.pl:Series**
  - **twitch:stream**
  - **twitch:video**
  - **twitch:vod**
+ - **TwitterCard**
  - **Ubu**
  - **udemy**
  - **udemy:course**
- - **UDNEmbed**
+ - **UDNEmbed**: 聯合影音
  - **Ultimedia**
  - **Unistra**
  - **Urort**: NRK P3 Urørt
  - **Vessel**
  - **Vesti**: Вести.Ru
  - **Vevo**
- - **VGTV**
+ - **VGTV**: VGTV and BTTV
  - **vh1.com**
  - **Vice**
  - **Viddler**
  - **vier:videos**
  - **Viewster**
  - **viki**
+ - **viki:channel**
  - **vimeo**
  - **vimeo:album**
  - **vimeo:channel**
  - **Vimple**: Vimple - one-click video hosting
  - **Vine**
  - **vine:user**
- - **vk.com**
- - **vk.com:user-videos**: vk.com:All of a user's videos
+ - **vk**: VK
+ - **vk:uservideos**: VK - User's Videos
  - **Vodlocker**
+ - **VoiceRepublic**
  - **Vporn**
  - **VRT**
  - **vube**: Vube.com
  - **wdr:mobile**
  - **WDRMaus**: Sendung mit der Maus
  - **WebOfStories**
+ - **WebOfStoriesPlaylist**
  - **Weibo**
  - **Wimp**
  - **Wistia**
+ - **WNL**
  - **WorldStarHipHop**
  - **wrzuta.pl**
  - **WSJ**: Wall Street Journal
  - **XBef**
  - **XboxClips**
  - **XHamster**
+ - **XHamsterEmbed**
  - **XMinus**
  - **XNXX**
+ - **Xstream**
  - **XTube**
  - **XTubeUser**: XTube user profile
- - **Xuite**
+ - **Xuite**: 隨意窩Xuite影音
  - **XVideos**
  - **XXXYMovies**
  - **Yahoo**: Yahoo screen and movies
- - **Yam**
+ - **Yam**: 蕃薯藤yam天空部落
  - **yandexmusic:album**: Яндекс.Музыка - Альбом
  - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
  - **yandexmusic:track**: Яндекс.Музыка - Трек
  - **YesJapan**
+ - **yinyuetai:video**: 音悦Tai
  - **Ynet**
  - **YouJizz**
- - **Youku**
+ - **youku**: 优酷
  - **YouPorn**
  - **YourUpload**
  - **youtube**: YouTube.com
index 82b827536d746246d3241fd856f749945cf6d561..a13c09ef40c7d8c101c86b471cae142ada9ccd7c 100644 (file)
@@ -12,6 +12,7 @@ import copy
 
 from test.helper import FakeYDL, assertRegexpMatches
 from youtube_dl import YoutubeDL
+from youtube_dl.compat import compat_str
 from youtube_dl.extractor import YoutubeIE
 from youtube_dl.postprocessor.common import PostProcessor
 from youtube_dl.utils import match_filter_func
@@ -507,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase):
         res = get_videos(f)
         self.assertEqual(res, ['1'])
 
+    def test_playlist_items_selection(self):
+        entries = [{
+            'id': compat_str(i),
+            'title': compat_str(i),
+            'url': TEST_URL,
+        } for i in range(1, 5)]
+        playlist = {
+            '_type': 'playlist',
+            'id': 'test',
+            'entries': entries,
+            'extractor': 'test:playlist',
+            'extractor_key': 'test:playlist',
+            'webpage_url': 'http://example.com',
+        }
+
+        def get_ids(params):
+            ydl = YDL(params)
+            # make a copy because the dictionary can be modified
+            ydl.process_ie_result(playlist.copy())
+            return [int(v['id']) for v in ydl.downloaded_info_dicts]
+
+        result = get_ids({})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 10})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 2})
+        self.assertEqual(result, [1, 2])
+
+        result = get_ids({'playliststart': 10})
+        self.assertEqual(result, [])
+
+        result = get_ids({'playliststart': 2})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2-4'})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2,4'})
+        self.assertEqual(result, [2, 4])
+
+        result = get_ids({'playlist_items': '10'})
+        self.assertEqual(result, [])
+
 
 if __name__ == '__main__':
     unittest.main()
index 4dc7de7b5b8d55fc1d95296b58e076aba664e1cc..315a3f5ae6a597662d05f56e97672b4ff93aff10 100644 (file)
@@ -39,7 +39,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 16))
         self.assertEqual(decrypted, self.secret_msg)
 
@@ -47,7 +47,7 @@ class TestAES(unittest.TestCase):
         encrypted = base64.b64encode(
             intlist_to_bytes(self.iv[:8]) +
             b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83'
-        )
+        ).decode('utf-8')
         decrypted = (aes_decrypt_text(encrypted, password, 32))
         self.assertEqual(decrypted, self.secret_msg)
 
index 1eb454e068970eb1a0d48cc3cd881e0bf71f9463..c3ba8ad2e3aa1f5cd33dd5a61a184d52cc0c07a9 100644 (file)
@@ -14,6 +14,8 @@ from youtube_dl.utils import get_filesystem_encoding
 from youtube_dl.compat import (
     compat_getenv,
     compat_expanduser,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
 )
 
 
@@ -42,5 +44,28 @@ class TestCompat(unittest.TestCase):
             dir(youtube_dl.compat))) - set(['unicode_literals'])
         self.assertEqual(all_names, sorted(present_names))
 
+    def test_compat_urllib_parse_unquote(self):
+        self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def')
+        self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def')
+        self.assertEqual(compat_urllib_parse_unquote(''), '')
+        self.assertEqual(compat_urllib_parse_unquote('%'), '%')
+        self.assertEqual(compat_urllib_parse_unquote('%%'), '%%')
+        self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%')
+        self.assertEqual(compat_urllib_parse_unquote('%2F'), '/')
+        self.assertEqual(compat_urllib_parse_unquote('%2f'), '/')
+        self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波')
+        self.assertEqual(
+            compat_urllib_parse_unquote('''<meta property="og:description" content="%E2%96%81%E2%96%82%E2%96%83%E2%96%84%25%E2%96%85%E2%96%86%E2%96%87%E2%96%88" />
+%<a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%B3%D9%88%D9%86%D8%A7%D9%85%D9%8A">%a'''),
+            '''<meta property="og:description" content="▁▂▃▄%▅▆▇█" />
+%<a href="https://ar.wikipedia.org/wiki/تسونامي">%a''')
+        self.assertEqual(
+            compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%86%B6%I%Break%25Things%'''),
+            '''(^◣_◢^)っ︻デ═一    ⇀    ⇀    ⇀    ⇀    ⇀    ↶%I%Break%Things%''')
+
+    def test_compat_urllib_parse_unquote_plus(self):
+        self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def')
+        self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def')
+
 if __name__ == '__main__':
     unittest.main()
index 891ee620b1f2627dd6991e0cccfbc58b59fb6a95..c4e3adb67b7d1034b36cdd3c45969fe321351c64 100644 (file)
@@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['no']))
-        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a')
+        self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')
 
 
 class TestRaiSubtitles(BaseTestSubtitles):
index 032d3656ae21ec733a11d946e52693ebd560af16..65692a9fbb2bdc0dd6dbc5203439f1ca12bf5c46 100644 (file)
@@ -40,7 +40,6 @@ from youtube_dl.utils import (
     read_batch_urls,
     sanitize_filename,
     sanitize_path,
-    sanitize_url_path_consecutive_slashes,
     prepend_extension,
     replace_extension,
     shell_quote,
@@ -176,26 +175,6 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(sanitize_path('./abc'), 'abc')
         self.assertEqual(sanitize_path('./../abc'), '..\\abc')
 
-    def test_sanitize_url_path_consecutive_slashes(self):
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname//'),
-            'http://hostname/')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'),
-            'http://hostname/foo/bar/filename.html')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/'),
-            'http://hostname/')
-        self.assertEqual(
-            sanitize_url_path_consecutive_slashes('http://hostname/abc//'),
-            'http://hostname/abc/')
-
     def test_prepend_extension(self):
         self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext')
         self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext')
@@ -345,6 +324,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_duration('02:03:04'), 7384)
         self.assertEqual(parse_duration('01:02:03:04'), 93784)
         self.assertEqual(parse_duration('1 hour 3 minutes'), 3780)
+        self.assertEqual(parse_duration('87 Min.'), 5220)
 
     def test_fix_xml_ampersands(self):
         self.assertEqual(
@@ -621,7 +601,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
                 <div xml:lang="en">
                     <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
                     <p begin="1" end="2">第二行<br/>♪♪</p>
-                    <p begin="2" end="3"><span>Third<br/>Line</span></p>
+                    <p begin="2" dur="1"><span>Third<br/>Line</span></p>
                 </div>
             </body>
             </tt>'''
@@ -642,6 +622,21 @@ Line
 '''
         self.assertEqual(dfxp2srt(dfxp_data), srt_data)
 
+        dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The first line</p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The first line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tox.ini b/tox.ini
index 00c6e00e3b72c4de21dc725173e3bb60ea5fa55b..cd805fe8ac27481937a1000a5a37412ff4f0d923 100644 (file)
--- a/tox.ini
+++ b/tox.ini
@@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34
 deps =
    nose
    coverage
+# We need a valid $HOME for test_compat_expanduser
+passenv = HOME
 defaultargs = test --exclude test_download.py --exclude test_age_restriction.py
     --exclude test_subtitles.py --exclude test_write_annotations.py
     --exclude test_youtube_lists.py
index eb7470f72cd39572071b6624180d4d2e0c08cbda..702a6ad50b6c6bf2d3f3bfbd8c873cb3a64c8e7b 100755 (executable)
@@ -49,6 +49,7 @@ from .utils import (
     ExtractorError,
     format_bytes,
     formatSeconds,
+    HEADRequest,
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
@@ -118,7 +119,7 @@ class YoutubeDL(object):
 
     username:          Username for authentication purposes.
     password:          Password for authentication purposes.
-    videopassword:     Password for acces a video.
+    videopassword:     Password for accessing a video.
     usenetrc:          Use netrc for authentication instead.
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
@@ -138,6 +139,7 @@ class YoutubeDL(object):
     outtmpl:           Template for output names.
     restrictfilenames: Do not allow "&" and spaces in file names
     ignoreerrors:      Do not stop on download errors.
+    force_generic_extractor: Force downloader to use the generic extractor
     nooverwrites:      Prevent overwriting files.
     playliststart:     Playlist item to start at.
     playlistend:       Playlist item to end at.
@@ -260,7 +262,8 @@ class YoutubeDL(object):
     The following options are used by the post processors:
     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                        otherwise prefer avconv.
-    exec_cmd:          Arbitrary command to run after downloading
+    postprocessor_args: A list of additional command-line arguments for the
+                        postprocessor.
     """
 
     params = None
@@ -626,13 +629,16 @@ class YoutubeDL(object):
             info_dict.setdefault(key, value)
 
     def extract_info(self, url, download=True, ie_key=None, extra_info={},
-                     process=True):
+                     process=True, force_generic_extractor=False):
         '''
         Returns a list with a dictionary for each video we find.
         If 'download', also downloads the videos.
         extra_info is a dict containing the extra values to add to each result
         '''
 
+        if not ie_key and force_generic_extractor:
+            ie_key = 'Generic'
+
         if ie_key:
             ies = [self.get_info_extractor(ie_key)]
         else:
@@ -760,7 +766,9 @@ class YoutubeDL(object):
             if isinstance(ie_entries, list):
                 n_all_entries = len(ie_entries)
                 if playlistitems:
-                    entries = [ie_entries[i - 1] for i in playlistitems]
+                    entries = [
+                        ie_entries[i - 1] for i in playlistitems
+                        if -n_all_entries <= i - 1 < n_all_entries]
                 else:
                     entries = ie_entries[playliststart:playlistend]
                 n_entries = len(entries)
@@ -922,8 +930,9 @@ class YoutubeDL(object):
                 if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
             if audiovideo_formats:
                 return audiovideo_formats[format_idx]
-            # for audio only urls, select the best/worst audio format
-            elif all(f.get('acodec') != 'none' for f in available_formats):
+            # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
+            elif (all(f.get('acodec') != 'none' for f in available_formats) or
+                  all(f.get('vcodec') != 'none' for f in available_formats)):
                 return available_formats[format_idx]
         elif format_spec == 'bestaudio':
             audio_formats = [
@@ -1001,7 +1010,7 @@ class YoutubeDL(object):
                 t.get('preference'), t.get('width'), t.get('height'),
                 t.get('id'), t.get('url')))
             for i, t in enumerate(thumbnails):
-                if 'width' in t and 'height' in t:
+                if t.get('width') and t.get('height'):
                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
                 if t.get('id') is None:
                     t['id'] = '%d' % i
@@ -1013,13 +1022,13 @@ class YoutubeDL(object):
             info_dict['display_id'] = info_dict['id']
 
         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
-            # Working around negative timestamps in Windows
-            # (see http://bugs.python.org/issue1646728)
-            if info_dict['timestamp'] < 0 and os.name == 'nt':
-                info_dict['timestamp'] = 0
-            upload_date = datetime.datetime.utcfromtimestamp(
-                info_dict['timestamp'])
-            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+            # see http://bugs.python.org/issue1646728)
+            try:
+                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            except (ValueError, OverflowError, OSError):
+                pass
 
         if self.params.get('listsubtitles', False):
             if 'automatic_captions' in info_dict:
@@ -1030,12 +1039,6 @@ class YoutubeDL(object):
             info_dict['id'], info_dict.get('subtitles'),
             info_dict.get('automatic_captions'))
 
-        # This extractors handle format selection themselves
-        if info_dict['extractor'] in ['Youku']:
-            if download:
-                self.process_info(info_dict)
-            return info_dict
-
         # We now pick which formats have to be downloaded
         if info_dict.get('formats') is None:
             # There's only one format available
@@ -1046,6 +1049,8 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')
 
+        formats_dict = {}
+
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
             if 'url' not in format:
@@ -1053,6 +1058,18 @@ class YoutubeDL(object):
 
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
             if format.get('format') is None:
                 format['format'] = '{id} - {res}{note}'.format(
                     id=format['format_id'],
@@ -1086,8 +1103,12 @@ class YoutubeDL(object):
         req_format = self.params.get('format')
         if req_format is None:
             req_format_list = []
-            if info_dict['extractor'] in ['youtube', 'ted'] and FFmpegMergerPP(self).available:
-                req_format_list.append('bestvideo+bestaudio')
+            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
+                    info_dict['extractor'] in ['youtube', 'ted'] and
+                    not info_dict.get('is_live')):
+                merger = FFmpegMergerPP(self)
+                if merger.available and merger.can_merge():
+                    req_format_list.append('bestvideo+bestaudio')
             req_format_list.append('best')
             req_format = '/'.join(req_format_list)
         formats_to_download = []
@@ -1364,7 +1385,7 @@ class YoutubeDL(object):
                         postprocessors = []
                         self.report_warning('You have requested multiple '
                                             'formats but ffmpeg or avconv are not installed.'
-                                            ' The formats won\'t be merged')
+                                            ' The formats won\'t be merged.')
                     else:
                         postprocessors = [merger]
 
@@ -1391,8 +1412,8 @@ class YoutubeDL(object):
                     requested_formats = info_dict['requested_formats']
                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                         info_dict['ext'] = 'mkv'
-                        self.report_warning('You have requested formats incompatible for merge. '
-                                            'The formats will be merged into mkv')
+                        self.report_warning(
+                            'Requested formats are incompatible for merge and will be merged into mkv.')
                     # Ensure filename always has a correct extension for successful merge
                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
                     if os.path.exists(encodeFilename(filename)):
@@ -1479,7 +1500,8 @@ class YoutubeDL(object):
         for url in url_list:
             try:
                 # It also downloads the videos
-                res = self.extract_info(url)
+                res = self.extract_info(
+                    url, force_generic_extractor=self.params.get('force_generic_extractor', False))
             except UnavailableVideoError:
                 self.report_error('unable to download video')
             except MaxDownloadsReached:
@@ -1523,6 +1545,7 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
+            files_to_delete = []
             try:
                 files_to_delete, info = pp.run(info)
             except PostProcessingError as e:
@@ -1701,7 +1724,8 @@ class YoutubeDL(object):
             if req_is_string:
                 req = url_escaped
             else:
-                req = compat_urllib_request.Request(
+                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+                req = req_type(
                     url_escaped, data=req.data, headers=req.headers,
                     origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 
@@ -1847,7 +1871,7 @@ class YoutubeDL(object):
             thumb_ext = determine_ext(t['url'], 'jpg')
             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
-            thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
 
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
index c88489f2989a70f8daaf1f8899c8788495091afa..55b22c889f97c73d28e732466f850bcfb1615c83 100644 (file)
@@ -169,7 +169,7 @@ def _real_main(argv=None):
         if not opts.audioquality.isdigit():
             parser.error('invalid audio quality specified')
     if opts.recodevideo is not None:
-        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
+        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']:
             parser.error('invalid video recode format specified')
     if opts.convertsubtitles is not None:
         if opts.convertsubtitles not in ['srt', 'vtt', 'ass']:
@@ -240,13 +240,18 @@ def _real_main(argv=None):
     if opts.xattrs:
         postprocessors.append({'key': 'XAttrMetadata'})
     if opts.embedthumbnail:
-        postprocessors.append({'key': 'EmbedThumbnail'})
+        already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails
+        postprocessors.append({
+            'key': 'EmbedThumbnail',
+            'already_have_thumbnail': already_have_thumbnail
+        })
+        if not already_have_thumbnail:
+            opts.writethumbnail = True
     # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
     # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
     if opts.exec_cmd:
         postprocessors.append({
             'key': 'ExecAfterDownload',
-            'verboseOutput': opts.verbose,
             'exec_cmd': opts.exec_cmd,
         })
     if opts.xattr_set_filesize:
@@ -258,6 +263,9 @@ def _real_main(argv=None):
     external_downloader_args = None
     if opts.external_downloader_args:
         external_downloader_args = shlex.split(opts.external_downloader_args)
+    postprocessor_args = None
+    if opts.postprocessor_args:
+        postprocessor_args = shlex.split(opts.postprocessor_args)
     match_filter = (
         None if opts.match_filter is None
         else match_filter_func(opts.match_filter))
@@ -288,6 +296,7 @@ def _real_main(argv=None):
         'autonumber_size': opts.autonumber_size,
         'restrictfilenames': opts.restrictfilenames,
         'ignoreerrors': opts.ignoreerrors,
+        'force_generic_extractor': opts.force_generic_extractor,
         'ratelimit': opts.ratelimit,
         'nooverwrites': opts.nooverwrites,
         'retries': opts_retries,
@@ -345,7 +354,6 @@ def _real_main(argv=None):
         'default_search': opts.default_search,
         'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
         'encoding': opts.encoding,
-        'exec_cmd': opts.exec_cmd,
         'extract_flat': opts.extract_flat,
         'merge_output_format': opts.merge_output_format,
         'postprocessors': postprocessors,
@@ -362,6 +370,7 @@ def _real_main(argv=None):
         'ffmpeg_location': opts.ffmpeg_location,
         'hls_prefer_native': opts.hls_prefer_native,
         'external_downloader_args': external_downloader_args,
+        'postprocessor_args': postprocessor_args,
         'cn_verification_proxy': opts.cn_verification_proxy,
     }
 
index 07224d5084158184ac30c927b1b79802d398c336..7817adcfdd546f70cfb76e0634b8df8ddbcaf8e0 100644 (file)
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
     """
     NONCE_LENGTH_BYTES = 8
 
-    data = bytes_to_intlist(base64.b64decode(data))
+    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
     password = bytes_to_intlist(password.encode('utf-8'))
 
     key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
index f9529210dd955932eca837aa7022696470c557ed..0c57c7aebf6bd45a8f9edfc7e1d585f1bbf2f23a 100644 (file)
@@ -9,6 +9,7 @@ import shutil
 import socket
 import subprocess
 import sys
+import itertools
 
 
 try:
@@ -74,42 +75,74 @@ except ImportError:
     import BaseHTTPServer as compat_http_server
 
 try:
+    from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
     from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
-    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
-        if string == '':
+    from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+except ImportError:  # Python 2
+    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
+                else re.compile('([\x00-\x7f]+)'))
+
+    # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
+    # implementations from cpython 3.4.3's stdlib. Python 2's version
+    # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244)
+
+    def compat_urllib_parse_unquote_to_bytes(string):
+        """unquote_to_bytes('abc%20def') -> b'abc def'."""
+        # Note: strings are encoded as UTF-8. This is only an issue if it contains
+        # unescaped non-ASCII characters, which URIs should not.
+        if not string:
+            # Is it a string-like object?
+            string.split
+            return b''
+        if isinstance(string, unicode):
+            string = string.encode('utf-8')
+        bits = string.split(b'%')
+        if len(bits) == 1:
             return string
-        res = string.split('%')
-        if len(res) == 1:
+        res = [bits[0]]
+        append = res.append
+        for item in bits[1:]:
+            try:
+                append(compat_urllib_parse._hextochr[item[:2]])
+                append(item[2:])
+            except KeyError:
+                append(b'%')
+                append(item)
+        return b''.join(res)
+
+    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
+        """Replace %xx escapes by their single-character equivalent. The optional
+        encoding and errors parameters specify how to decode percent-encoded
+        sequences into Unicode characters, as accepted by the bytes.decode()
+        method.
+        By default, percent-encoded sequences are decoded with UTF-8, and invalid
+        sequences are replaced by a placeholder character.
+
+        unquote('abc%20def') -> 'abc def'.
+        """
+        if '%' not in string:
+            string.split
             return string
         if encoding is None:
             encoding = 'utf-8'
         if errors is None:
             errors = 'replace'
-        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-        pct_sequence = b''
-        string = res[0]
-        for item in res[1:]:
-            try:
-                if not item:
-                    raise ValueError
-                pct_sequence += item[:2].decode('hex')
-                rest = item[2:]
-                if not rest:
-                    # This segment was just a single percent-encoded character.
-                    # May be part of a sequence of code units, so delay decoding.
-                    # (Stored in pct_sequence).
-                    continue
-            except ValueError:
-                rest = '%' + item
-            # Encountered non-percent-encoded characters. Flush the current
-            # pct_sequence.
-            string += pct_sequence.decode(encoding, errors) + rest
-            pct_sequence = b''
-        if pct_sequence:
-            # Flush the final pct_sequence
-            string += pct_sequence.decode(encoding, errors)
-        return string
+        bits = _asciire.split(string)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'):
+        """Like unquote(), but also replace plus signs by spaces, as required for
+        unquoting HTML form values.
+
+        unquote_plus('%7e/abc+def') -> '~/abc def'
+        """
+        string = string.replace('+', ' ')
+        return compat_urllib_parse_unquote(string, encoding, errors)
 
 try:
     compat_str = unicode  # Python 2
@@ -388,6 +421,15 @@ else:
             pass
         return _terminal_size(columns, lines)
 
+try:
+    itertools.count(start=0, step=1)
+    compat_itertools_count = itertools.count
+except TypeError:  # Python 2.6
+    def compat_itertools_count(start=0, step=1):
+        n = start
+        while True:
+            yield n
+            n += step
 
 __all__ = [
     'compat_HTTPError',
@@ -401,6 +443,7 @@ __all__ = [
     'compat_html_entities',
     'compat_http_client',
     'compat_http_server',
+    'compat_itertools_count',
     'compat_kwargs',
     'compat_ord',
     'compat_parse_qs',
@@ -411,6 +454,8 @@ __all__ = [
     'compat_urllib_error',
     'compat_urllib_parse',
     'compat_urllib_parse_unquote',
+    'compat_urllib_parse_unquote_plus',
+    'compat_urllib_parse_unquote_to_bytes',
     'compat_urllib_parse_urlparse',
     'compat_urllib_request',
     'compat_urlparse',
index f110830c472eb451d77b48fa9337bd5feee55952..dccc59212d3028bb9a96f0eb9ffff4acb0be681e 100644 (file)
@@ -8,6 +8,7 @@ from .hls import NativeHlsFD
 from .http import HttpFD
 from .rtsp import RtspFD
 from .rtmp import RtmpFD
+from .dash import DashSegmentsFD
 
 from ..utils import (
     determine_protocol,
@@ -20,6 +21,7 @@ PROTOCOL_MAP = {
     'mms': RtspFD,
     'rtsp': RtspFD,
     'f4m': F4mFD,
+    'http_dash_segments': DashSegmentsFD,
 }
 
 
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
new file mode 100644 (file)
index 0000000..8b6fa27
--- /dev/null
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import FileDownloader
+from ..compat import compat_urllib_request
+
+
+class DashSegmentsFD(FileDownloader):
+    """
+    Download segments in a DASH manifest
+    """
+    def real_download(self, filename, info_dict):
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+        base_url = info_dict['url']
+        segment_urls = info_dict['segment_urls']
+
+        is_test = self.params.get('test', False)
+        remaining_bytes = self._TEST_FILE_SIZE if is_test else None
+        byte_counter = 0
+
+        def append_url_to_file(outf, target_url, target_name, remaining_bytes=None):
+            self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name))
+            req = compat_urllib_request.Request(target_url)
+            if remaining_bytes is not None:
+                req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1))
+
+            data = self.ydl.urlopen(req).read()
+
+            if remaining_bytes is not None:
+                data = data[:remaining_bytes]
+
+            outf.write(data)
+            return len(data)
+
+        def combine_url(base_url, target_url):
+            if re.match(r'^https?://', target_url):
+                return target_url
+            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
+
+        with open(tmpfilename, 'wb') as outf:
+            append_url_to_file(
+                outf, combine_url(base_url, info_dict['initialization_url']),
+                'initialization segment')
+            for i, segment_url in enumerate(segment_urls):
+                segment_len = append_url_to_file(
+                    outf, combine_url(base_url, segment_url),
+                    'segment %d / %d' % (i + 1, len(segment_urls)),
+                    remaining_bytes)
+                byte_counter += segment_len
+                if remaining_bytes is not None:
+                    remaining_bytes -= segment_len
+                    if remaining_bytes <= 0:
+                        break
+
+        self.try_rename(tmpfilename, filename)
+
+        self._hook_progress({
+            'downloaded_bytes': byte_counter,
+            'total_bytes': byte_counter,
+            'filename': filename,
+            'status': 'finished',
+        })
+
+        return True
index 7ca2d314348400fc6ba3095e23d69e1c925ad7dd..1d5cc99043d02f658064e688c268c37171c37325 100644 (file)
@@ -109,6 +109,14 @@ class Aria2cFD(ExternalFD):
         cmd += ['--', info_dict['url']]
         return cmd
 
+
+class HttpieFD(ExternalFD):
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['%s:%s' % (key, val)]
+        return cmd
+
 _BY_NAME = dict(
     (klass.get_basename(), klass)
     for name, klass in globals().items()
@@ -123,5 +131,6 @@ def list_external_downloaders():
 def get_external_downloader(external_downloader):
     """ Given the name of the executable, see whether we support the given
         downloader . """
-    bn = os.path.basename(external_downloader)
+    # Drop .exe extension on Windows
+    bn = os.path.splitext(os.path.basename(external_downloader))[0]
     return _BY_NAME[bn]
index ee05a69583309105cc8f459e096590014057be39..ec4c017f31d0a2bad84a8d93430a97126865a022 100644 (file)
@@ -4,7 +4,10 @@ from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+    AdobeTVIE,
+    AdobeTVVideoIE,
+)
 from .adultswim import AdultSwimIE
 from .aftenposten import AftenpostenIE
 from .aftonbladet import AftonbladetIE
@@ -16,9 +19,14 @@ from .anysex import AnySexIE
 from .aol import AolIE
 from .allocine import AllocineIE
 from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
 from .appletrailers import AppleTrailersIE
 from .archiveorg import ArchiveOrgIE
-from .ard import ARDIE, ARDMediathekIE
+from .ard import (
+    ARDIE,
+    ARDMediathekIE,
+    SportschauIE,
+)
 from .arte import (
     ArteTvIE,
     ArteTVPlus7IE,
@@ -103,6 +111,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
     DailymotionUserIE,
+    DailymotionCloudIE,
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
@@ -112,6 +121,10 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
+from .dramafever import (
+    DramaFeverIE,
+    DramaFeverSeriesIE,
+)
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
@@ -136,11 +149,11 @@ from .ellentv import (
 )
 from .elpais import ElPaisIE
 from .embedly import EmbedlyIE
-from .empflix import EMPFlixIE
 from .engadget import EngadgetIE
 from .eporner import EpornerIE
 from .eroprofile import EroProfileIE
 from .escapist import EscapistIE
+from .espn import ESPNIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
@@ -148,10 +161,10 @@ from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
-from .firedrive import FiredriveIE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
 from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
 from .fktv import (
     FKTVIE,
     FKTVPosteckeIE,
@@ -229,6 +242,7 @@ from .infoq import InfoQIE
 from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
 from .ir90tv import Ir90TvIE
 from .ivi import (
     IviIE,
@@ -244,6 +258,7 @@ from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
@@ -251,15 +266,27 @@ from .keek import KeekIE
 from .kontrtube import KontrTubeIE
 from .krasview import KrasViewIE
 from .ku6 import Ku6IE
+from .kuwo import (
+    KuwoIE,
+    KuwoAlbumIE,
+    KuwoChartIE,
+    KuwoSingerIE,
+    KuwoCategoryIE,
+    KuwoMvIE,
+)
 from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
+from .lecture2go import Lecture2GoIE
 from .letv import (
     LetvIE,
     LetvTvIE,
     LetvPlaylistIE
 )
 from .libsyn import LibsynIE
-from .lifenews import LifeNewsIE
+from .lifenews import (
+    LifeNewsIE,
+    LifeEmbedIE,
+)
 from .liveleak import LiveLeakIE
 from .livestream import (
     LivestreamIE,
@@ -311,6 +338,7 @@ from .musicvault import MusicVaultIE
 from .muzu import MuzuTVIE
 from .myspace import MySpaceIE, MySpaceAlbumIE
 from .myspass import MySpassIE
+from .myvi import MyviIE
 from .myvideo import MyVideoIE
 from .myvidster import MyVidsterIE
 from .nationalgeographic import NationalGeographicIE
@@ -322,18 +350,29 @@ from .nbc import (
     NBCSportsIE,
     NBCSportsVPlayerIE,
 )
-from .ndr import NDRIE
+from .ndr import (
+    NDRIE,
+    NJoyIE,
+)
 from .ndtv import NDTVIE
 from .netzkino import NetzkinoIE
 from .nerdcubed import NerdCubedFeedIE
 from .nerdist import NerdistIE
+from .neteasemusic import (
+    NetEaseMusicIE,
+    NetEaseMusicAlbumIE,
+    NetEaseMusicSingerIE,
+    NetEaseMusicListIE,
+    NetEaseMusicMvIE,
+    NetEaseMusicProgramIE,
+    NetEaseMusicDjRadioIE,
+)
 from .newgrounds import NewgroundsIE
 from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
     NextMediaActionNewsIE,
-    AppleDailyRealtimeNewsIE,
-    AppleDailyAnimationNewsIE
+    AppleDailyIE,
 )
 from .nfb import NFBIE
 from .nfl import NFLIE
@@ -347,15 +386,18 @@ from .ninegag import NineGagIE
 from .noco import NocoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
+from .nova import NovaIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
     NPOLiveIE,
     NPORadioIE,
     NPORadioFragmentIE,
-    TegenlichtVproIE,
+    VPROIE,
+    WNLIE
 )
 from .nrk import (
     NRKIE,
@@ -371,7 +413,11 @@ from .nytimes import (
 from .nuvid import NuvidIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
-from .ooyala import OoyalaIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+    OoyalaIE,
+    OoyalaExternalIE,
+)
 from .openfilm import OpenFilmIE
 from .orf import (
     ORFTVthekIE,
@@ -385,6 +431,7 @@ from .pbs import PBSIE
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
@@ -392,6 +439,7 @@ from .playfm import PlayFMIE
 from .playvid import PlayvidIE
 from .playwire import PlaywireIE
 from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
 from .pornhd import PornHdIE
 from .pornhub import (
     PornHubIE,
@@ -409,6 +457,8 @@ from .qqmusic import (
     QQMusicIE,
     QQMusicSingerIE,
     QQMusicAlbumIE,
+    QQMusicToplistIE,
+    QQMusicPlaylistIE,
 )
 from .quickvid import QuickVidIE
 from .r7 import R7IE
@@ -418,6 +468,7 @@ from .radiobremen import RadioBremenIE
 from .radiofrance import RadioFranceIE
 from .rai import RaiIE
 from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
 from .redtube import RedTubeIE
 from .restudy import RestudyIE
 from .reverbnation import ReverbNationIE
@@ -428,7 +479,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -442,6 +492,7 @@ from .rutube import (
     RutubePersonIE,
 )
 from .rutv import RUTVIE
+from .ruutu import RuutuIE
 from .sandia import SandiaIE
 from .safari import (
     SafariIE,
@@ -469,9 +520,16 @@ from .smotri import (
     SmotriUserIE,
     SmotriBroadcastIE,
 )
+from .snagfilms import (
+    SnagFilmsIE,
+    SnagFilmsEmbedIE,
+)
 from .snotr import SnotrIE
-from .sockshare import SockshareIE
 from .sohu import SohuIE
+from .soompi import (
+    SoompiIE,
+    SoompiShowIE,
+)
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -484,8 +542,9 @@ from .soundgasm import (
 )
 from .southpark import (
     SouthParkIE,
-    SouthParkEsIE,
     SouthParkDeIE,
+    SouthParkDkIE,
+    SouthParkEsIE,
     SouthParkNlIE
 )
 from .space import SpaceIE
@@ -495,7 +554,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
 from .spiegeltv import SpiegeltvIE
 from .spike import SpikeIE
 from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
+    SportBoxIE,
+    SportBoxEmbedIE,
+)
 from .sportdeutschland import SportDeutschlandIE
 from .srf import SrfIE
 from .srmediathek import SRMediathekIE
@@ -506,7 +568,10 @@ from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
 from .sunporno import SunPornoIE
-from .svtplay import SVTPlayIE
+from .svt import (
+    SVTIE,
+    SVTPlayIE,
+)
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
@@ -532,11 +597,19 @@ from .tf1 import TF1IE
 from .theonion import TheOnionIE
 from .theplatform import ThePlatformIE
 from .thesixtyone import TheSixtyOneIE
+from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .tlc import TlcIE, TlcDeIE
-from .tmz import TMZIE
-from .tnaflix import TNAFlixIE
+from .tmz import (
+    TMZIE,
+    TMZArticleIE,
+)
+from .tnaflix import (
+    TNAFlixIE,
+    EMPFlixIE,
+    MovieFapIE,
+)
 from .thvideo import (
     THVideoIE,
     THVideoPlaylistIE
@@ -547,12 +620,21 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
 from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import TuneInIE
 from .turbo import TurboIE
 from .tutv import TutvIE
+from .tv2 import (
+    TV2IE,
+    TV2ArticleIE,
+)
 from .tv4 import TV4IE
+from .tvc import (
+    TVCIE,
+    TVCArticleIE,
+)
 from .tvigle import TvigleIE
 from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
@@ -571,6 +653,7 @@ from .twitch import (
     TwitchBookmarksIE,
     TwitchStreamIE,
 )
+from .twitter import TwitterCardIE
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
@@ -588,7 +671,11 @@ from .veoh import VeohIE
 from .vessel import VesselIE
 from .vesti import VestiIE
 from .vevo import VevoIE
-from .vgtv import VGTVIE
+from .vgtv import (
+    BTArticleIE,
+    BTVestlendingenIE,
+    VGTVIE,
+)
 from .vh1 import VH1IE
 from .vice import ViceIE
 from .viddler import ViddlerIE
@@ -619,12 +706,16 @@ from .vine import (
     VineIE,
     VineUserIE,
 )
-from .viki import VikiIE
+from .viki import (
+    VikiIE,
+    VikiChannelIE,
+)
 from .vk import (
     VKIE,
     VKUserVideosIE,
 )
 from .vodlocker import VodlockerIE
+from .voicerepublic import VoiceRepublicIE
 from .vporn import VpornIE
 from .vrt import VRTIE
 from .vube import VubeIE
@@ -639,7 +730,10 @@ from .wdr import (
     WDRMobileIE,
     WDRMausIE,
 )
-from .webofstories import WebOfStoriesIE
+from .webofstories import (
+    WebOfStoriesIE,
+    WebOfStoriesPlaylistIE,
+)
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
@@ -648,12 +742,16 @@ from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
-from .xvideos import XVideosIE
+from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
 from .xuite import XuiteIE
+from .xvideos import XVideosIE
 from .xxxymovies import XXXYMoviesIE
 from .yahoo import (
     YahooIE,
@@ -666,6 +764,7 @@ from .yandexmusic import (
     YandexMusicPlaylistIE,
 )
 from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
 from .ynet import YnetIE
 from .youjizz import YouJizzIE
 from .youku import YoukuIE
index 97d12856092975a094ec18a5fd7ecafef39c255a..5e43adc51f98c2f22e728c49150b84ae64f704e3 100644 (file)
@@ -5,6 +5,8 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     str_to_int,
+    float_or_none,
+    ISO639Utils,
 )
 
 
@@ -69,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class AdobeTVVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+            video_id)
+
+        formats = [{
+            'url': source['src'],
+            'width': source.get('width'),
+            'height': source.get('height'),
+            'tbr': source.get('bitrate'),
+        } for source in player_params['sources']]
+
+        # For both metadata and downloaded files the duration varies among
+        # formats. I just pick the max one
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in player_params['sources']]))
+
+        subtitles = {}
+        for translation in player_params.get('translations', []):
+            lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            if lang_id not in subtitles:
+                subtitles[lang_id] = []
+            subtitles[lang_id].append({
+                'url': translation['vttPath'],
+                'ext': 'vtt',
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': player_params['title'],
+            'description': self._og_search_description(webpage),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
index e15c015fbafd466aaba089bef979843a397f1ab7..0c00acfb5766ad6e899877b8ed173225c10fc4ec 100644 (file)
@@ -1,21 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    xpath_with_ns,
-    xpath_text,
-    find_xpath_attr,
-)
 
 
 class AftenpostenIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'
-
     _TEST = {
         'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
         'md5': 'fd828cd29774a729bf4d4425fe192972',
@@ -30,69 +20,4 @@ class AftenpostenIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        data = self._download_xml(
-            'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
-
-        NS_MAP = {
-            'atom': 'http://www.w3.org/2005/Atom',
-            'xt': 'http://xstream.dk/',
-            'media': 'http://search.yahoo.com/mrss/',
-        }
-
-        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
-
-        title = xpath_text(
-            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
-        description = xpath_text(
-            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
-        timestamp = parse_iso8601(xpath_text(
-            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
-
-        formats = []
-        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
-        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
-            media_url = media_content.get('url')
-            if not media_url:
-                continue
-            tbr = int_or_none(media_content.get('bitrate'))
-            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
-            if mobj:
-                formats.append({
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:%s' % mobj.group('playpath'),
-                    'app': mobj.group('app'),
-                    'ext': 'flv',
-                    'tbr': tbr,
-                    'format_id': 'rtmp-%d' % tbr,
-                })
-            else:
-                formats.append({
-                    'url': media_url,
-                    'tbr': tbr,
-                })
-        self._sort_formats(formats)
-
-        link = find_xpath_attr(
-            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
-        if link is not None:
-            formats.append({
-                'url': link.get('href'),
-                'format_id': link.get('rel'),
-            })
-
-        thumbnails = [{
-            'url': splash.get('url'),
-            'width': int_or_none(splash.get('width')),
-            'height': int_or_none(splash.get('height')),
-        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'timestamp': timestamp,
-            'formats': formats,
-            'thumbnails': thumbnails,
-        }
+        return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream')
index a117502bc0ad7bfec11592ec57da575898cacc3d..e0518cf261fbffc4dd23bc4a3800d04eae324139 100644 (file)
@@ -6,11 +6,11 @@ from ..utils import int_or_none
 
 
 class AftonbladetIE(InfoExtractor):
-    _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])'
+    _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+        'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
         'info_dict': {
-            'id': 'article36015',
+            'id': '36015',
             'ext': 'mp4',
             'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
             'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
 
         # find internal video meta data
         meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
-        internal_meta_id = self._html_search_regex(
-            r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+        player_config = self._parse_json(self._html_search_regex(
+            r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+        internal_meta_id = player_config['videoId']
         internal_meta_url = meta_url % internal_meta_id
         internal_meta_json = self._download_json(
             internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
new file mode 100644 (file)
index 0000000..ea7a703
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    str_to_int,
+    ExtractorError
+)
+
+
+class AppleConnectIE(InfoExtractor):
+    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
+    _TEST = {
+        'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+        'md5': '10d0f2799111df4cb1c924520ca78f98',
+        'info_dict': {
+            'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+            'ext': 'm4v',
+            'title': 'Energy',
+            'uploader': 'Drake',
+            'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+            'upload_date': '20150710',
+            'timestamp': 1436545535,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        try:
+            video_json = self._html_search_regex(
+                r'class="auc-video-data">(\{.*?\})', webpage, 'json')
+        except ExtractorError:
+            raise ExtractorError('This post doesn\'t contain a video', expected=True)
+
+        video_data = self._parse_json(video_json, video_id)
+        timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
+        like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+
+        return {
+            'id': video_id,
+            'url': video_data['sslSrc'],
+            'title': video_data['title'],
+            'description': video_data['description'],
+            'uploader': video_data['artistName'],
+            'thumbnail': video_data['artworkUrl'],
+            'timestamp': timestamp,
+            'like_count': like_count,
+        }
index 6a35ea463edcafe3b9d7db4c53b9bf0c53198fd0..6f465789b497a6625776c383ff699a64b0b5c346 100644 (file)
@@ -8,6 +8,7 @@ from .generic import GenericIE
 from ..utils import (
     determine_ext,
     ExtractorError,
+    get_element_by_attribute,
     qualities,
     int_or_none,
     parse_duration,
@@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor):
     _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
 
     _TESTS = [{
-        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
-        'only_matching': True,
+        'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
+        'info_dict': {
+            'id': '29582122',
+            'ext': 'mp4',
+            'title': 'Ich liebe das Leben trotzdem',
+            'description': 'md5:45e4c225c72b27993314b31a84a5261c',
+            'duration': 4557,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
-        'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+        'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
+        'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
         'info_dict': {
-            'id': '22490580',
+            'id': '29522730',
             'ext': 'mp4',
-            'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
-            'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
+            'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)',
+            'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
+            'duration': 5252,
         },
-        'skip': 'Blocked outside of Germany',
+    }, {
+        # audio
+        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+        'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
+        'info_dict': {
+            'id': '28488308',
+            'ext': 'mp3',
+            'title': 'Tod eines Fußballers',
+            'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
+            'duration': 3240,
+        },
+    }, {
+        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+        'only_matching': True,
     }]
 
+    def _extract_media_info(self, media_info_url, webpage, video_id):
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+
+        formats = self._extract_formats(media_info, video_id)
+
+        if not formats:
+            if '"fsk"' in webpage:
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):
+                raise ExtractorError('This video is not available due to geo restriction', expected=True)
+
+        self._sort_formats(formats)
+
+        duration = int_or_none(media_info.get('_duration'))
+        thumbnail = media_info.get('_previewImage')
+
+        subtitles = {}
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{
+                'ext': 'srt',
+                'url': subtitle_url,
+            }]
+
+        return {
+            'id': video_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
+        for num, media in enumerate(media_array):
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:
+                    continue
+                if not isinstance(stream_urls, list):
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    ext = determine_ext(stream_url)
+                    if ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+                            video_id, preference=-1, f4m_id='hds'))
+                    elif ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
+                    else:
+                        if server and server.startswith('rtmp'):
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        elif stream_url.startswith('http'):
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        else:
+                            continue
+                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        if type_ == 'audio':
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats
+
     def _real_extract(self, url):
         # determine video id from url
         m = re.match(self._VALID_URL, url)
@@ -92,46 +199,22 @@ class ARDMediathekIE(InfoExtractor):
                     'format_id': fid,
                     'url': furl,
                 })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
         else:  # request JSON file
-            media_info = self._download_json(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
-            # The second element of the _mediaArray contains the standard http urls
-            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
-            if not streams:
-                if '"fsk"' in webpage:
-                    raise ExtractorError('This video is only available after 20:00')
-
-            formats = []
-            for s in streams:
-                if type(s['_stream']) == list:
-                    for index, url in enumerate(s['_stream'][::-1]):
-                        quality = s['_quality'] + index
-                        formats.append({
-                            'quality': quality,
-                            'url': url,
-                            'format_id': '%s-%s' % (determine_ext(url), quality)
-                        })
-                    continue
-
-                format = {
-                    'quality': s['_quality'],
-                    'url': s['_stream'],
-                }
-
-                format['format_id'] = '%s-%s' % (
-                    determine_ext(format['url']), format['quality'])
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
 
-                formats.append(format)
-
-        self._sort_formats(formats)
-
-        return {
+        info.update({
             'id': video_id,
             'title': title,
             'description': description,
-            'formats': formats,
             'thumbnail': thumbnail,
-        }
+        })
+
+        return info
 
 
 class ARDIE(InfoExtractor):
@@ -189,3 +272,41 @@ class ARDIE(InfoExtractor):
             'upload_date': upload_date,
             'thumbnail': thumbnail,
         }
+
+
+class SportschauIE(ARDMediathekIE):
+    IE_NAME = 'Sportschau'
+    _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
+    _TESTS = [{
+        'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
+        'info_dict': {
+            'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
+            'ext': 'mp4',
+            'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        base_url = mobj.group('baseurl')
+
+        webpage = self._download_webpage(url, video_id)
+        title = get_element_by_attribute('class', 'headline', webpage)
+        description = self._html_search_meta('description', webpage, 'description')
+
+        info = self._extract_media_info(
+            base_url + '-mc_defaultQuality-h.json', webpage, video_id)
+
+        info.update({
+            'title': title,
+            'description': description,
+        })
+
+        return info
index 8273bd6c9ae3cdff82052c8f63efc68be97561b3..76de244774369dd53c510961ecf6b6a7641c7027 100644 (file)
@@ -7,7 +7,6 @@ from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
     unified_strdate,
-    get_element_by_id,
     get_element_by_attribute,
     int_or_none,
     qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
     def _real_extract(self, url):
         anchor_id, lang = self._extract_url_info(url)
         webpage = self._download_webpage(url, anchor_id)
-        row = get_element_by_id(anchor_id, webpage)
+        row = self._search_regex(
+            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+            webpage, 'row')
         return self._extract_from_webpage(row, anchor_id, lang)
 
 
index 906895c1e197f510c358e47376c6f50d81bd8bdf..e37ee44403a34afe03c02660a71765e2af89cdba 100644 (file)
@@ -8,6 +8,7 @@ from ..compat import compat_urlparse
 
 
 class BaiduVideoIE(InfoExtractor):
+    IE_DESC = '百度视频'
     _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
     _TESTS = [{
         'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
index 249bc6bbde85dc568796f094f421f989df664a1c..b2e5f741870bc2437e1e2c126b468303ae781dda 100644 (file)
@@ -129,6 +129,20 @@ class BBCCoUkIE(InfoExtractor):
                 'skip_download': True,
             },
             'skip': 'geolocation',
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+            'info_dict': {
+                'id': 'b05zmgw1',
+                'ext': 'flv',
+                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+                'title': 'Royal Academy Summer Exhibition',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
         }, {
             'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
             'only_matching': True,
@@ -196,12 +210,12 @@ class BBCCoUkIE(InfoExtractor):
 
     def _extract_video(self, media, programme_id):
         formats = []
-        vbr = int(media.get('bitrate'))
+        vbr = int_or_none(media.get('bitrate'))
         vcodec = media.get('encoding')
         service = media.get('service')
-        width = int(media.get('width'))
-        height = int(media.get('height'))
-        file_size = int(media.get('media_file_size'))
+        width = int_or_none(media.get('width'))
+        height = int_or_none(media.get('height'))
+        file_size = int_or_none(media.get('media_file_size'))
         for connection in self._extract_connections(media):
             conn_formats = self._extract_connection(connection, programme_id)
             for format in conn_formats:
@@ -218,7 +232,7 @@ class BBCCoUkIE(InfoExtractor):
 
     def _extract_audio(self, media, programme_id):
         formats = []
-        abr = int(media.get('bitrate'))
+        abr = int_or_none(media.get('bitrate'))
         acodec = media.get('encoding')
         service = media.get('service')
         for connection in self._extract_connections(media):
@@ -237,26 +251,11 @@ class BBCCoUkIE(InfoExtractor):
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
             subtitles[lang] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
             ]
         return subtitles
 
@@ -267,7 +266,7 @@ class BBCCoUkIE(InfoExtractor):
                 programme_id, 'Downloading media selection XML')
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
+                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
             else:
                 raise
 
@@ -301,7 +300,7 @@ class BBCCoUkIE(InfoExtractor):
                     if kind != 'programme' and kind != 'radioProgramme':
                         continue
                     programme_id = item.get('vpid')
-                    duration = int(item.get('duration'))
+                    duration = int_or_none(item.get('duration'))
                     formats, subtitles = self._download_media_selector(programme_id)
                 return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
@@ -333,7 +332,7 @@ class BBCCoUkIE(InfoExtractor):
             title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
             description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
             programme_id = item.get('identifier')
-            duration = int(item.get('duration'))
+            duration = int_or_none(item.get('duration'))
             formats, subtitles = self._download_media_selector(programme_id)
 
         return programme_id, title, description, duration, formats, subtitles
@@ -362,7 +361,7 @@ class BBCCoUkIE(InfoExtractor):
             formats, subtitles = self._download_media_selector(programme_id)
             title = self._og_search_title(webpage)
             description = self._search_regex(
-                r'<p class="medium-description">([^<]+)</p>',
+                r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                 webpage, 'description', fatal=False)
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
index d2abd4d772c95e9877a607af7cc0b6e4d56e123a..03dad4636afdf0443735fde8f1d643aea553ba10 100644 (file)
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     xpath_text,
     xpath_with_ns,
@@ -16,11 +16,11 @@ class BetIE(InfoExtractor):
         {
             'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
             'info_dict': {
-                'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+                'id': 'news/national/2014/a-conversation-with-president-obama',
                 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
                 'ext': 'flv',
-                'title': 'BET News Presents: A Conversation With President Obama',
-                'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+                'title': 'A Conversation With President Obama',
+                'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
                 'duration': 1534,
                 'timestamp': 1418075340,
                 'upload_date': '20141208',
@@ -35,7 +35,7 @@ class BetIE(InfoExtractor):
         {
             'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
             'info_dict': {
-                'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+                'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
                 'display_id': 'justice-for-ferguson-a-community-reacts',
                 'ext': 'flv',
                 'title': 'Justice for Ferguson: A Community Reacts',
@@ -57,10 +57,13 @@ class BetIE(InfoExtractor):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
 
-        media_url = compat_urllib_parse.unquote(self._search_regex(
+        media_url = compat_urllib_parse_unquote(self._search_regex(
             [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
             webpage, 'media URL'))
 
+        video_id = self._search_regex(
+            r'/video/(.*)/_jcr_content/', media_url, 'video id')
+
         mrss = self._download_xml(media_url, display_id)
 
         item = mrss.find('./channel/item')
@@ -75,8 +78,6 @@ class BetIE(InfoExtractor):
         description = xpath_text(
             item, './description', 'description', fatal=False)
 
-        video_id = xpath_text(item, './guid', 'video id', fatal=False)
-
         timestamp = parse_iso8601(xpath_text(
             item, xpath_with_ns('./dc:date', NS_MAP),
             'upload date', fatal=False))
index 7ca835e31f3477f8e46c74804aea36ecfe789686..ecc17ebebca9e1819fc804f37d48dcceb80c44c5 100644 (file)
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
 
 import re
 import itertools
+import json
+import xml.etree.ElementTree as ET
 
 from .common import InfoExtractor
 from ..utils import (
@@ -39,8 +41,15 @@ class BiliBiliIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        if self._search_regex(r'(此视频不存在或被删除)', webpage, 'error message', default=None):
-            raise ExtractorError('The video does not exist or was deleted', expected=True)
+        if '(此视频不存在或被删除)' in webpage:
+            raise ExtractorError(
+                'The video does not exist or was deleted', expected=True)
+
+        if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
+            raise ExtractorError(
+                'The video is not available in your region due to copyright reasons',
+                expected=True)
+
         video_code = self._search_regex(
             r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
 
@@ -67,11 +76,19 @@ class BiliBiliIE(InfoExtractor):
 
         entries = []
 
-        lq_doc = self._download_xml(
+        lq_page = self._download_webpage(
             'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
             video_id,
             note='Downloading LQ video info'
         )
+        try:
+            err_info = json.loads(lq_page)
+            raise ExtractorError(
+                'BiliBili said: ' + err_info['error_text'], expected=True)
+        except ValueError:
+            pass
+
+        lq_doc = ET.fromstring(lq_page)
         lq_durls = lq_doc.findall('./durl')
 
         hq_doc = self._download_xml(
@@ -80,9 +97,11 @@ class BiliBiliIE(InfoExtractor):
             note='Downloading HQ video info',
             fatal=False,
         )
-        hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None)
-
-        assert len(lq_durls) == len(hq_durls)
+        if hq_doc is not False:
+            hq_durls = hq_doc.findall('./durl')
+            assert len(lq_durls) == len(hq_durls)
+        else:
+            hq_durls = itertools.repeat(None)
 
         i = 1
         for lq_durl, hq_durl in zip(lq_durls, hq_durls):
@@ -93,7 +112,7 @@ class BiliBiliIE(InfoExtractor):
                 'filesize': int_or_none(
                     lq_durl.find('./size'), get_attr='text'),
             }]
-            if hq_durl:
+            if hq_durl is not None:
                 formats.append({
                     'format_id': 'hq',
                     'quality': 2,
index fb56cd78d07ab396ae26ff5a15ac7ebdef933237..c3296283d0dfd1dd753b6e31082e48361414baa3 100644 (file)
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 
 from ..compat import (
-    compat_str,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -14,6 +13,8 @@ from ..utils import (
     int_or_none,
     parse_iso8601,
     unescapeHTML,
+    xpath_text,
+    xpath_with_ns,
 )
 
 
@@ -23,10 +24,10 @@ class BlipTVIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
-            'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+            'md5': '80baf1ec5c3d2019037c1c707d676b9f',
             'info_dict': {
                 'id': '5779306',
-                'ext': 'mov',
+                'ext': 'm4v',
                 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
                 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
                 'timestamp': 1323138843,
@@ -100,6 +101,20 @@ class BlipTVIE(InfoExtractor):
                 'vcodec': 'none',
             }
         },
+        {
+            # missing duration
+            'url': 'http://blip.tv/rss/flash/6700880',
+            'info_dict': {
+                'id': '6684191',
+                'ext': 'm4v',
+                'title': 'Cowboy Bebop: Gateway Shuffle Review',
+                'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8',
+                'timestamp': 1386639757,
+                'upload_date': '20131210',
+                'uploader': 'sfdebris',
+                'uploader_id': '706520',
+            }
+        }
     ]
 
     @staticmethod
@@ -128,35 +143,34 @@ class BlipTVIE(InfoExtractor):
 
         rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
 
-        def blip(s):
-            return '{http://blip.tv/dtd/blip/1.0}%s' % s
-
-        def media(s):
-            return '{http://search.yahoo.com/mrss/}%s' % s
-
-        def itunes(s):
-            return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+        def _x(p):
+            return xpath_with_ns(p, {
+                'blip': 'http://blip.tv/dtd/blip/1.0',
+                'media': 'http://search.yahoo.com/mrss/',
+                'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+            })
 
         item = rss.find('channel/item')
 
-        video_id = item.find(blip('item_id')).text
-        title = item.find('./title').text
-        description = clean_html(compat_str(item.find(blip('puredescription')).text))
-        timestamp = parse_iso8601(item.find(blip('datestamp')).text)
-        uploader = item.find(blip('user')).text
-        uploader_id = item.find(blip('userid')).text
-        duration = int(item.find(blip('runtime')).text)
-        media_thumbnail = item.find(media('thumbnail'))
-        thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
-        categories = [category.text for category in item.findall('category')]
+        video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id
+        title = xpath_text(item, 'title', 'title', fatal=True)
+        description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description'))
+        timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp'))
+        uploader = xpath_text(item, _x('blip:user'), 'uploader')
+        uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id')
+        duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration'))
+        media_thumbnail = item.find(_x('media:thumbnail'))
+        thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None
+                     else xpath_text(item, 'image', 'thumbnail'))
+        categories = [category.text for category in item.findall('category') if category is not None]
 
         formats = []
         subtitles_urls = {}
 
-        media_group = item.find(media('group'))
-        for media_content in media_group.findall(media('content')):
+        media_group = item.find(_x('media:group'))
+        for media_content in media_group.findall(_x('media:content')):
             url = media_content.get('url')
-            role = media_content.get(blip('role'))
+            role = media_content.get(_x('blip:role'))
             msg = self._download_webpage(
                 url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
                 video_id, 'Resolving URL for %s' % role)
@@ -175,8 +189,8 @@ class BlipTVIE(InfoExtractor):
                     'url': real_url,
                     'format_id': role,
                     'format_note': media_type,
-                    'vcodec': media_content.get(blip('vcodec')) or 'none',
-                    'acodec': media_content.get(blip('acodec')),
+                    'vcodec': media_content.get(_x('blip:vcodec')) or 'none',
+                    'acodec': media_content.get(_x('blip:acodec')),
                     'filesize': media_content.get('filesize'),
                     'width': int_or_none(media_content.get('width')),
                     'height': int_or_none(media_content.get('height')),
index 45ba5173246575ab617dbab911280b75d61d61e8..66e394e1093105b936191da798734128d4ea1afe 100644 (file)
@@ -16,27 +16,38 @@ class BRIE(InfoExtractor):
 
     _TESTS = [
         {
-            'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html',
-            'md5': '93556dd2bcb2948d9259f8670c516d59',
+            'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
+            'md5': '83a0477cf0b8451027eb566d88b51106',
             'info_dict': {
-                'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',
+                'id': '48f656ef-287e-486f-be86-459122db22cc',
                 'ext': 'mp4',
-                'title': 'Wenn das Traditions-Theater wackelt',
-                'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
-                'duration': 34,
-                'uploader': 'BR',
-                'upload_date': '20140802',
+                'title': 'Die böse Überraschung',
+                'description': 'Betriebliche Altersvorsorge: Die böse Überraschung',
+                'duration': 180,
+                'uploader': 'Reinhard Weber',
+                'upload_date': '20150422',
             }
         },
         {
-            'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
-            'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+            'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
+            'md5': 'a44396d73ab6a68a69a568fae10705bb',
             'info_dict': {
-                'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+                'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
+                'ext': 'mp4',
+                'title': 'Manfred Schreiber ist tot',
+                'description': 'Abendschau kompakt: Manfred Schreiber ist tot',
+                'duration': 26,
+            }
+        },
+        {
+            'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html',
+            'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
+            'info_dict': {
+                'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
                 'ext': 'aac',
-                'title': '"Keine neuen Schulden im nächsten Jahr"',
-                'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
-                'duration': 64,
+                'title': 'Kurzweilig und sehr bewegend',
+                'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend',
+                'duration': 296,
             }
         },
         {
index 4f60d53660fa7777b9e1b6152967ce2e7e567ec9..4721c22930f15cb51d0daaac294eeeca3a329092 100644 (file)
@@ -13,6 +13,7 @@ from ..compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..utils import (
     determine_ext,
@@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor):
 
         try:
             object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
-        except xml.etree.ElementTree.ParseError:
+        except compat_xml_parse_error:
             return
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
@@ -156,6 +157,28 @@ class BrightcoveIE(InfoExtractor):
         linkBase = find_param('linkBaseURL')
         if linkBase is not None:
             params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brighcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #   // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC.\createVideo\(
+                .*?                                                  # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                                     # in length, however it's appended to itself
+                                                                     # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
         data = compat_urllib_parse.urlencode(params)
         return cls._FEDERATED_URL_TEMPLATE % data
 
@@ -172,7 +195,7 @@ class BrightcoveIE(InfoExtractor):
         """Return a list of all Brightcove URLs from the webpage """
 
         url_m = re.search(
-            r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+            r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
             webpage)
         if url_m:
             url = unescapeHTML(url_m.group(1))
@@ -188,7 +211,12 @@ class BrightcoveIE(InfoExtractor):
                 [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
             ).+?>\s*</object>''',
             webpage)
-        return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+        if matches:
+            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+        return list(filter(None, [
+            cls._build_brighcove_url_from_js(custom_bc)
+            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
index 6252be05b7f4b57787152b4edae5378675a96847..3b2de517e53da39e06912ce1a97c4aafe7fa250e 100644 (file)
@@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'md5:5438d33774b6bdc662f9485a340401cc',
             'title': 'Season 5 Episode 5',
-            'thumbnail': 're:^https?://.*promo.*'
+            'thumbnail': 're:^https?://.*\.jpg$'
         },
         'params': {
             'skip_download': True,
index 1b14471e57198c2a04833089c174c0c6c3108ab8..57e0cda2c4aafdf65cd070c24f7503af9f85adab 100644 (file)
@@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor):
     }
 
     _TESTS = [{
-        'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
-        'md5': '3db39fb48b9685438ecf33a1078023e4',
+        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
+        'md5': 'b3481d7ca972f61e37420798d0a9d934',
         'info_dict': {
-            'id': '922470',
+            'id': '1263092',
             'ext': 'flv',
-            'title': 'Zapping - 26/08/13',
-            'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
-            'upload_date': '20130826',
+            'title': 'Le Zapping - 13/05/15',
+            'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
+            'upload_date': '20150513',
         },
     }, {
         'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor):
         'skip': 'videos get deleted after a while',
     }, {
         'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
-        'md5': '65aa83ad62fe107ce29e564bb8712580',
+        'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4',
         'info_dict': {
             'id': '1213714',
             'ext': 'flv',
@@ -106,15 +106,11 @@ class CanalplusIE(InfoExtractor):
                 continue
             format_id = fmt.tag
             if format_id == 'HLS':
-                hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv')
-                for fmt in hls_formats:
-                    fmt['preference'] = preference(format_id)
-                formats.extend(hls_formats)
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', preference=preference(format_id)))
             elif format_id == 'HDS':
-                hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id)
-                for fmt in hds_formats:
-                    fmt['preference'] = preference(format_id)
-                formats.extend(hds_formats)
+                formats.extend(self._extract_f4m_formats(
+                    format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id)))
             else:
                 formats.append({
                     'url': format_url,
index 1ceb9d8d9df6c0268e33de5e34c01a245e134e05..75fffb1563ae9f95bf862ad156111b6962a8429e 100644 (file)
@@ -4,12 +4,13 @@ from .common import InfoExtractor
 
 
 class CBSIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
         'info_dict': {
             'id': '4JUVEwq3wUT7',
+            'display_id': 'connect-chat-feat-garth-brooks',
             'ext': 'flv',
             'title': 'Connect Chat feat. Garth Brooks',
             'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -24,6 +25,7 @@ class CBSIE(InfoExtractor):
         'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
         'info_dict': {
             'id': 'WWF_5KqY3PK1',
+            'display_id': 'st-vincent',
             'ext': 'flv',
             'title': 'Live on Letterman - St. Vincent',
             'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -34,12 +36,23 @@ class CBSIE(InfoExtractor):
             'skip_download': True,
         },
         '_skip': 'Blocked outside the US',
+    }, {
+        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
         real_id = self._search_regex(
-            r"video\.settings\.pid\s*=\s*'([^']+)';",
+            [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
             webpage, 'real video ID')
-        return self.url_result('theplatform:%s' % real_id)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': 'theplatform:%s' % real_id,
+            'display_id': display_id,
+        }
index 7e47960ab08f3ffba7a1e596b34c2fbfc7fd6c59..52e61d85b3a20bc939771cee2b94188d32f16d17 100644 (file)
@@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):
                 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
                 'ext': 'flv',
                 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
-                'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+                'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 205,
             },
             'params': {
index 2a5d4be185d64f47ce37ca83f9bb6bd1d7eac829..6924eac704cd5cf02d266bd60125dad9cf13e765 100644 (file)
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
-        'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
         'info_dict': {
             'id': '20131228183',
             'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
 
         matches = re.finditer(r'''(?xs)
             <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
-            <a\s+href='(?P<http_url>[^']+)'>\s*
+            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
             (?:
                 .*?
                 <a\s+href='(?P<torrent_url>[^']+\.torrent)'
index 65f6be62313dfc623cf1f9aa7adc52282872aade..dda583680a03ba3cb420beb74a99af2ec60cbc83 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
@@ -88,7 +89,7 @@ class CeskaTelevizeIE(InfoExtractor):
         if playlist_url == 'error_region':
             raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
 
-        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
+        req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url))
         req.add_header('Referer', url)
 
         playlist = self._download_json(req, video_id)
index c922f695905d70e4052ddfa5c8f336c01221413b..0206d96db4670fb29a40353839dae15911b9c6d3 100644 (file)
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
 
         base64_video_info = self._html_search_regex(
             r'var cozVidData = "(.+?)";', webpage, 'video data')
-        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
         video_info_dict = json.loads(decoded_video_info)
 
         # get video information from dict
index cf0a7551b7f3df672d20aad7a00d7ced43bf2945..c949a481477c187d9433f39e6e851b87c97edf79 100644 (file)
@@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor):
                 'uploader_id': 'Cinemassacre',
                 'title': 'AVGN: McKids',
             }
+        },
+        {
+            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+            'md5': '1376908e49572389e7b06251a53cdd08',
+            'info_dict': {
+                'id': 'Cinemassacre-555779690c440',
+                'ext': 'mp4',
+                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+                'upload_date': '20150525',
+            }
         }
     ]
 
@@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor):
 
         playerdata_url = self._search_regex(
             [
-                r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
                 r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
             ],
             webpage, 'player data URL', default=None)
index d07d544eaf7742bb782a8b367a09561f14c44a4e..8306d6fb7d0d4414cff36f7b381ca9c877820f58 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
@@ -10,9 +8,9 @@ from ..utils import (
 
 
 class ClipsyndicateIE(InfoExtractor):
-    _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+    _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
         'md5': '4d7d549451bad625e0ff3d7bd56d776c',
         'info_dict': {
@@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor):
             'duration': 612,
             'thumbnail': 're:^https?://.+\.jpg',
         },
-    }
+    }, {
+        'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
         js_player = self._download_webpage(
             'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
             video_id, 'Downlaoding player')
index 3145b30514ea2a075f92077b9f87b64c9e8820a7..5dd69bff7ac73bcc0adc4d91c614045ddf116a9c 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 class CNETIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
         'info_dict': {
             'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
         'params': {
             'skip_download': 'requires rtmpdump',
         }
-    }
+    }, {
+        'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+        'info_dict': {
+            'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+            'ext': 'flv',
+            'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+            'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+            'uploader': 'Ashley Esqueda',
+            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
             raise ExtractorError('Cannot find video data')
 
         mpx_account = data['config']['players']['default']['mpx_account']
-        vid = vdata['files']['rtmp']
+        vid = vdata['files'].get('rtmp', vdata['files']['hds'])
         tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
 
         video_id = vdata['id']
index 5efc5f4fe556a4424542b441a83f2d6dbd5bc8e7..3b1bd4033fd1c01986c83ab44cc1cebaa1b19e5b 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
index 981e34bc7ccce5a87b2ec518efaa016afbd27002..14b9b4fe2320c6125b083ebf4afc683c1a2e4b52 100644 (file)
@@ -22,18 +22,20 @@ from ..compat import (
     compat_str,
 )
 from ..utils import (
+    NO_DEFAULT,
     age_restricted,
     bug_reports_message,
     clean_html,
     compiled_regex_type,
+    determine_ext,
     ExtractorError,
+    fix_xml_ampersands,
     float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
 )
-_NO_DEFAULT = object()
 
 
 class InfoExtractor(object):
@@ -63,7 +65,7 @@ class InfoExtractor(object):
 
                     Potential fields:
                     * url        Mandatory. The URL of the video file
-                    * ext        Will be calculated from url if missing
+                    * ext        Will be calculated from URL if missing
                     * format     A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
@@ -153,7 +155,7 @@ class InfoExtractor(object):
                     lower to higher preference, each element is a dictionary
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
-                        * "url": A url pointing to the subtitles file
+                        * "url": A URL pointing to the subtitles file
     automatic_captions: Like 'subtitles', used by the YoutubeIE for
                     automatically generated captions
     duration:       Length of the video in seconds, as an integer.
@@ -174,13 +176,17 @@ class InfoExtractor(object):
                                      Set to "root" to indicate that this is a
                                      comment to the original video.
     age_limit:      Age restriction for the video, as an integer (years)
-    webpage_url:    The url to the video webpage, if given to youtube-dl it
+    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
 
     Unless mentioned otherwise, the fields should be Unicode strings.
 
@@ -499,7 +505,7 @@ class InfoExtractor(object):
     # Methods for following #608
     @staticmethod
     def url_result(url, ie=None, video_id=None, video_title=None):
-        """Returns a url that points to a page that should be processed"""
+        """Returns a URL that points to a page that should be processed"""
         # TODO: ie should be the class used for getting the info
         video_info = {'_type': 'url',
                       'url': url,
@@ -523,7 +529,7 @@ class InfoExtractor(object):
             video_info['description'] = playlist_description
         return video_info
 
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Perform a regex search on the given string, using a single or a list of
         patterns returning the first matching group.
@@ -549,7 +555,7 @@ class InfoExtractor(object):
                 return next(g for g in mobj.groups() if g is not None)
             else:
                 return mobj.group(group)
-        elif default is not _NO_DEFAULT:
+        elif default is not NO_DEFAULT:
             return default
         elif fatal:
             raise RegexNotFoundError('Unable to extract %s' % _name)
@@ -557,7 +563,7 @@ class InfoExtractor(object):
             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
@@ -633,7 +639,7 @@ class InfoExtractor(object):
         return unescapeHTML(escaped)
 
     def _og_search_thumbnail(self, html, **kargs):
-        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 
     def _og_search_description(self, html, **kargs):
         return self._og_search_property('description', html, fatal=False, **kargs)
@@ -705,6 +711,25 @@ class InfoExtractor(object):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
+    @staticmethod
+    def _hidden_inputs(html):
+        return dict([
+            (input.group('name'), input.group('value')) for input in re.finditer(
+                r'''(?x)
+                    <input\s+
+                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
+                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
+                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
+                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
+                ''', html)
+        ])
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
+
     def _sort_formats(self, formats, field_preference=None):
         if not formats:
             raise ExtractorError('No video formats found')
@@ -786,8 +811,8 @@ class InfoExtractor(object):
             return True
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError):
-                self.report_warning(
-                    '%s URL is invalid, skipping' % item, video_id)
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
             raise
 
@@ -815,10 +840,14 @@ class InfoExtractor(object):
         self.to_screen(msg)
         time.sleep(timeout)
 
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest')
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source)
 
         formats = []
         manifest_version = '1.0'
@@ -828,8 +857,19 @@ class InfoExtractor(object):
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
-                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
-                                (media_el.attrib.get('href') or media_el.attrib.get('url')))
+                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                # If media_url is itself a f4m manifest do the recursive extraction
+                # since bitrates in parent manifest (this one) and media_url manifest
+                # may differ leading to inability to resolve the format by requested
+                # bitrate in f4m downloader
+                if determine_ext(manifest_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
@@ -846,7 +886,8 @@ class InfoExtractor(object):
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True):
 
         formats = [{
             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -865,8 +906,11 @@ class InfoExtractor(object):
 
         m3u8_doc = self._download_webpage(
             m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal)
+        if m3u8_doc is False:
+            return m3u8_doc
         last_info = None
         last_media = None
         kv_rex = re.compile(
@@ -956,7 +1000,7 @@ class InfoExtractor(object):
     def _parse_smil_video(self, video, video_id, base, rtmp_count):
         src = video.get('src')
         if not src:
-            return ([], rtmp_count)
+            return [], rtmp_count
         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
         width = int_or_none(video.get('width'))
         height = int_or_none(video.get('height'))
@@ -969,7 +1013,7 @@ class InfoExtractor(object):
                     proto = 'http'
         ext = video.get('ext')
         if proto == 'm3u8':
-            return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
+            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
         elif proto == 'rtmp':
             rtmp_count += 1
             streamer = video.get('streamer') or base
@@ -1072,14 +1116,11 @@ class InfoExtractor(object):
     def _get_automatic_captions(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
-    def _subtitles_timecode(self, seconds):
-        return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
-
 
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
-    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
     Instances should define _SEARCH_KEY and _MAX_RESULTS.
     """
 
index 1c77df47ef346173fc11a58396c98768e5afc986..d1b6d7366e847015af4581160c918d4a1ee6e11f 100644 (file)
@@ -12,6 +12,7 @@ from math import pow, sqrt, floor
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -27,7 +28,7 @@ from ..aes import (
 
 
 class CrunchyrollIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
     _NETRC_MACHINE = 'crunchyroll'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@@ -45,6 +46,22 @@ class CrunchyrollIE(InfoExtractor):
             # rtmp
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+        'info_dict': {
+            'id': '589804',
+            'ext': 'flv',
+            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+            'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Danny Choo Network',
+            'upload_date': '20120213',
+        },
+        'params': {
+            # rtmp
+            'skip_download': True,
+        },
+
     }, {
         'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
         'only_matching': True,
@@ -76,8 +93,8 @@ class CrunchyrollIE(InfoExtractor):
         self._login()
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(data)
-        iv = bytes_to_intlist(iv)
+        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +196,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
         return output
 
+    def _extract_subtitles(self, subtitle):
+        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +217,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
             if not id or not iv or not data:
                 continue
-            id = int(id)
-            iv = base64.b64decode(iv)
-            data = base64.b64decode(data)
-
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
-            sub_root = xml.etree.ElementTree.fromstring(subtitle)
-            subtitles[lang_code] = [
-                {
-                    'ext': 'srt',
-                    'data': self._convert_subtitles_to_srt(sub_root),
-                },
-                {
-                    'ext': 'ass',
-                    'data': self._convert_subtitles_to_ass(sub_root),
-                },
-            ]
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
         return subtitles
 
     def _real_extract(self, url):
@@ -242,7 +255,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             video_upload_date = unified_strdate(video_upload_date)
         video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
 
-        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
+        playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
         playerdata_req = compat_urllib_request.Request(playerdata_url)
         playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
         playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -255,16 +268,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
             stream_quality, stream_format = self._FORMAT_IDS[fmt]
             video_format = fmt + 'p'
-            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
-            # urlencode doesn't work!
-            streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+            streamdata_req = compat_urllib_request.Request(
+                'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+                % (stream_id, stream_format, stream_quality),
+                compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
             streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
             streamdata = self._download_xml(
                 streamdata_req, video_id,
                 note='Downloading media info for %s' % video_format)
-            video_url = streamdata.find('./host').text
-            video_play_path = streamdata.find('./file').text
+            stream_info = streamdata.find('./{default}preload/stream_info')
+            video_url = stream_info.find('./host').text
+            video_play_path = stream_info.find('./file').text
             formats.append({
                 'url': video_url,
                 'play_path': video_play_path,
index 0226f8036c81d97246c87f2308614d84202ae540..45049bf371370da6e4b64952441e76d86814fd6a 100644 (file)
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError
 
 
 class CtsNewsIE(InfoExtractor):
+    IE_DESC = '華視新聞'
     # https connection failed (Connection reset)
     _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
     _TESTS = [{
index aa595af20f66445559784afbcbb5a54a38eb69aa..85d945509d07ca686490c3471d0b109991524ed1 100644 (file)
@@ -13,8 +13,10 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    determine_ext,
     int_or_none,
     orderedSet,
+    parse_iso8601,
     str_to_int,
     unescapeHTML,
 )
@@ -28,10 +30,16 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
         request.add_header('Cookie', 'family_filter=off; ff=off')
         return request
 
+    def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
+        request = self._build_request(url)
+        return self._download_webpage_handle(request, *args, **kwargs)
 
-class DailymotionIE(DailymotionBaseInfoExtractor):
-    """Information Extractor for Dailymotion"""
+    def _download_webpage_no_ff(self, url, *args, **kwargs):
+        request = self._build_request(url)
+        return self._download_webpage(request, *args, **kwargs)
 
+
+class DailymotionIE(DailymotionBaseInfoExtractor):
     _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
     IE_NAME = 'dailymotion'
 
@@ -50,8 +58,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'info_dict': {
                 'id': 'x2iuewm',
                 'ext': 'mp4',
-                'uploader': 'IGN',
                 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+                'description': 'Several come bundled with the Steam Controller.',
+                'thumbnail': 're:^https?:.*\.(?:jpg|png)$',
+                'duration': 74,
+                'timestamp': 1425657362,
+                'upload_date': '20150306',
+                'uploader': 'IGN',
+                'uploader_id': 'xijv66',
+                'age_limit': 0,
+                'view_count': int,
+                'comment_count': int,
             }
         },
         # Vevo video
@@ -85,38 +102,106 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        url = 'https://www.dailymotion.com/video/%s' % video_id
 
-        # Retrieve video webpage to extract further information
-        request = self._build_request(url)
-        webpage = self._download_webpage(request, video_id)
+        webpage = self._download_webpage_no_ff(
+            'https://www.dailymotion.com/video/%s' % video_id, video_id)
+
+        age_limit = self._rta_search(webpage)
 
-        # Extract URL, uploader and title from webpage
-        self.report_extraction(video_id)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
 
-        # It may just embed a vevo video:
-        m_vevo = re.search(
+        view_count = str_to_int(self._search_regex(
+            [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"',
+             r'video_views_count[^>]+>\s+([\d\.,]+)'],
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._search_regex(
+            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
+            webpage, 'comment count', fatal=False))
+
+        player_v5 = self._search_regex(
+            r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
+            webpage, 'player v5', default=None)
+        if player_v5:
+            player = self._parse_json(player_v5, video_id)
+            metadata = player['metadata']
+            formats = []
+            for quality, media_list in metadata['qualities'].items():
+                for media in media_list:
+                    media_url = media.get('url')
+                    if not media_url:
+                        continue
+                    type_ = media.get('type')
+                    if type_ == 'application/vnd.lumberjack.manifest':
+                        continue
+                    if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            media_url, video_id, 'mp4', m3u8_id='hls'))
+                    else:
+                        f = {
+                            'url': media_url,
+                            'format_id': quality,
+                        }
+                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        formats.append(f)
+            self._sort_formats(formats)
+
+            title = metadata['title']
+            duration = int_or_none(metadata.get('duration'))
+            timestamp = int_or_none(metadata.get('created_time'))
+            thumbnail = metadata.get('poster_url')
+            uploader = metadata.get('owner', {}).get('screenname')
+            uploader_id = metadata.get('owner', {}).get('id')
+
+            subtitles = {}
+            for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items():
+                subtitles[subtitle_lang] = [{
+                    'ext': determine_ext(subtitle_url),
+                    'url': subtitle_url,
+                } for subtitle_url in subtitle.get('urls', [])]
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'timestamp': timestamp,
+                'uploader': uploader,
+                'uploader_id': uploader_id,
+                'age_limit': age_limit,
+                'view_count': view_count,
+                'comment_count': comment_count,
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        # vevo embed
+        vevo_id = self._search_regex(
             r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
-            webpage)
-        if m_vevo is not None:
-            vevo_id = m_vevo.group('id')
-            self.to_screen('Vevo video detected: %s' % vevo_id)
-            return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+            webpage, 'vevo embed', default=None)
+        if vevo_id:
+            return self.url_result('vevo:%s' % vevo_id, 'Vevo')
 
-        age_limit = self._rta_search(webpage)
+        # fallback old player
+        embed_page = self._download_webpage_no_ff(
+            'https://www.dailymotion.com/embed/video/%s' % video_id,
+            video_id, 'Downloading embed page')
+
+        timestamp = parse_iso8601(self._html_search_meta(
+            'video:release_date', webpage, 'upload date'))
+
+        info = self._parse_json(
+            self._search_regex(
+                r'var info = ({.*?}),$', embed_page,
+                'video info', flags=re.MULTILINE),
+            video_id)
 
-        video_upload_date = None
-        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
-        if mobj is not None:
-            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
-
-        embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id
-        embed_request = self._build_request(embed_url)
-        embed_page = self._download_webpage(
-            embed_request, video_id, 'Downloading embed page')
-        info = self._search_regex(r'var info = ({.*?}),$', embed_page,
-                                  'video info', flags=re.MULTILINE)
-        info = json.loads(info)
         if info.get('error') is not None:
             msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
             raise ExtractorError(msg, expected=True)
@@ -137,16 +222,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                     'width': width,
                     'height': height,
                 })
-        if not formats:
-            raise ExtractorError('Unable to extract video URL')
+        self._sort_formats(formats)
 
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, webpage)
 
-        view_count = str_to_int(self._search_regex(
-            r'video_views_count[^>]+>\s+([\d\.,]+)',
-            webpage, 'view count', fatal=False))
-
         title = self._og_search_title(webpage, default=None)
         if title is None:
             title = self._html_search_regex(
@@ -157,12 +237,14 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'id': video_id,
             'formats': formats,
             'uploader': info['owner.screenname'],
-            'upload_date': video_upload_date,
+            'timestamp': timestamp,
             'title': title,
+            'description': description,
             'subtitles': video_subtitles,
             'thumbnail': info['thumbnail_url'],
             'age_limit': age_limit,
             'view_count': view_count,
+            'duration': info['duration']
         }
 
     def _get_subtitles(self, video_id, webpage):
@@ -197,10 +279,17 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
     def _extract_entries(self, id):
         video_ids = []
+        processed_urls = set()
         for pagenum in itertools.count(1):
-            request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
-            webpage = self._download_webpage(request,
-                                             id, 'Downloading page %s' % pagenum)
+            page_url = self._PAGE_TEMPLATE % (id, pagenum)
+            webpage, urlh = self._download_webpage_handle_no_ff(
+                page_url, id, 'Downloading page %s' % pagenum)
+            if urlh.geturl() in processed_urls:
+                self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
+                    page_url, urlh.geturl()), id)
+                break
+
+            processed_urls.add(urlh.geturl())
 
             video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
 
@@ -224,7 +313,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
 class DailymotionUserIE(DailymotionPlaylistIE):
     IE_NAME = 'dailymotion:user'
-    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
     _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
     _TESTS = [{
         'url': 'https://www.dailymotion.com/user/nqtv',
@@ -233,12 +322,24 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': 'Rémi Gaillard',
         },
         'playlist_mincount': 100,
+    }, {
+        'url': 'http://www.dailymotion.com/user/UnderProject',
+        'info_dict': {
+            'id': 'UnderProject',
+            'title': 'UnderProject',
+        },
+        'playlist_mincount': 1800,
+        'expected_warnings': [
+            'Stopped at duplicated page',
+        ],
+        'skip': 'Takes too long time',
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
-        webpage = self._download_webpage(url, user)
+        webpage = self._download_webpage(
+            'https://www.dailymotion.com/user/%s' % user, user)
         full_user = unescapeHTML(self._html_search_regex(
             r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
             webpage, 'user'))
@@ -249,3 +350,52 @@ class DailymotionUserIE(DailymotionPlaylistIE):
             'title': full_user,
             'entries': self._extract_entries(user),
         }
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+    _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
+    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
+    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX
+
+    _TESTS = [{
+        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+        # Tested at FranceTvInfo_2
+        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+        'only_matching': True,
+    }, {
+        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
+        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_dmcloud_url(self, webpage):
+        mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
+        if mobj:
+            return mobj.group(1)
+
+        mobj = re.search(
+            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+            webpage)
+        if mobj:
+            return mobj.group(1)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage_no_ff(url, video_id)
+
+        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+        video_info = self._parse_json(self._search_regex(
+            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+        # TODO: parse ios_url, which is in fact a manifest
+        video_url = video_info['mp4_url']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': video_info.get('thumbnail_url'),
+        }
index 8049779b0a31049f704bae256a3752a9a22ad789..263532cc6e66a94c79670caa5e1600444ce909da 100644 (file)
@@ -3,42 +3,47 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import unified_strdate
 
 
 class DFBIE(InfoExtractor):
     IE_NAME = 'tv.dfb.de'
-    _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/',
+        'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
         # The md5 is different each time
         'info_dict': {
-            'id': '9070',
+            'id': '11633',
+            'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
             'ext': 'flv',
-            'title': 'Highlights des Empfangs in Berlin',
-            'upload_date': '20140716',
+            'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
+            'upload_date': '20150714',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)
         player_info = self._download_xml(
             'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
-            video_id)
+            display_id)
         video_info = player_info.find('video')
 
-        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
+        f4m_info = self._download_xml(
+            self._proto_relative_url(video_info.find('url').text.strip()), display_id)
         token_el = f4m_info.find('token')
         manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
+        formats = self._extract_f4m_formats(manifest_url, display_id)
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'title': video_info.find('title').text,
-            'url': manifest_url,
-            'ext': 'flv',
             'thumbnail': self._og_search_thumbnail(webpage),
-            'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]),
+            'upload_date': unified_strdate(video_info.find('time_date').text),
+            'formats': formats,
         }
index d3e6675283cddcb8f6a6dfffbfbd1e1ea3da11bc..d6723ecf26ea67356b288df6e5f3bf612141b91a 100644 (file)
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    parse_duration,
     parse_iso8601,
-    int_or_none,
 )
+from ..compat import compat_str
 
 
 class DiscoveryIE(InfoExtractor):
     _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
-        'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
         'info_dict': {
-            'id': 'mission-impossible-outtakes',
-            'ext': 'flv',
+            'id': '20769',
+            'ext': 'mp4',
             'title': 'Mission Impossible Outtakes',
             'description': ('Watch Jamie Hyneman and Adam Savage practice being'
                             ' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
             'timestamp': 1303099200,
             'upload_date': '20110418',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
+    }, {
+        'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+        'info_dict': {
+            'id': 'mythbusters-the-simpsons',
+            'title': 'MythBusters: The Simpsons',
+        },
+        'playlist_count': 9,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        info = self._download_json(url + '?flat=1', video_id)
 
-        info = self._parse_json(self._search_regex(
-            r'(?s)<script type="application/ld\+json">(.*?)</script>',
-            webpage, 'video info'), video_id)
+        video_title = info.get('playlist_title') or info.get('video_title')
 
-        return {
-            'id': video_id,
-            'title': info['name'],
-            'url': info['contentURL'],
-            'description': info.get('description'),
-            'thumbnail': info.get('thumbnailUrl'),
-            'timestamp': parse_iso8601(info.get('uploadDate')),
-            'duration': int_or_none(info.get('duration')),
-        }
+        entries = [{
+            'id': compat_str(video_info['id']),
+            'formats': self._extract_m3u8_formats(
+                video_info['src'], video_id, ext='mp4',
+                note='Download m3u8 information for video %d' % (idx + 1)),
+            'title': video_info['title'],
+            'description': video_info.get('description'),
+            'duration': parse_duration(video_info.get('video_length')),
+            'webpage_url': video_info.get('href'),
+            'thumbnail': video_info.get('thumbnailURL'),
+            'alt_title': video_info.get('secondary_title'),
+            'timestamp': parse_iso8601(video_info.get('publishedDate')),
+        } for idx, video_info in enumerate(info['playlist'])]
+
+        return self.playlist_result(entries, video_id, video_title)
index 479430c51072ab91e976df4d459af372c5608cdd..373b3b4b4735d8544128c48a10037eed3c570e5d 100644 (file)
@@ -9,6 +9,7 @@ from ..compat import (compat_str, compat_basestring)
 
 
 class DouyuTVIE(InfoExtractor):
+    IE_DESC = '斗鱼'
     _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
     _TESTS = [{
         'url': 'http://www.douyutv.com/iseven',
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644 (file)
index 0000000..38e6597
--- /dev/null
@@ -0,0 +1,216 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+    _NETRC_MACHINE = 'dramafever'
+
+    _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+
+    _consumer_secret = None
+
+    def _get_consumer_secret(self):
+        mainjs = self._download_webpage(
+            'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+            None, 'Downloading main.js', fatal=False)
+        if not mainjs:
+            return self._CONSUMER_SECRET
+        return self._search_regex(
+            r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+            'consumer secret', default=self._CONSUMER_SECRET)
+
+    def _real_initialize(self):
+        self._login()
+        self._consumer_secret = self._get_consumer_secret()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'username': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        response = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if all(logout_pattern not in response
+               for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+            error = self._html_search_regex(
+                r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+    _TEST = {
+        'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin 4512.1',
+            'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1404336058,
+            'upload_date': '20140702',
+            'duration': 343,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url).replace('/', '.')
+
+        try:
+            feed = self._download_json(
+                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+                video_id, 'Downloading episode JSON')['channel']['item']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                raise ExtractorError(
+                    'Currently unavailable in your country.', expected=True)
+            raise
+
+        media_group = feed.get('media-group', {})
+
+        formats = []
+        for media_content in media_group['media-content']:
+            src = media_content.get('@attributes', {}).get('url')
+            if not src:
+                continue
+            ext = determine_ext(src)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    src, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = media_group.get('media-title')
+        description = media_group.get('media-description')
+        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+        thumbnail = self._proto_relative_url(
+            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+        subtitles = {}
+        for media_subtitle in media_group.get('media-subTitle', []):
+            lang = media_subtitle.get('@attributes', {}).get('lang')
+            href = media_subtitle.get('@attributes', {}).get('href')
+            if not lang or not href:
+                continue
+            subtitles[lang] = [{
+                'ext': 'ttml',
+                'url': href,
+            }]
+
+        series_id, episode_number = video_id.split('.')
+        episode_info = self._download_json(
+            # We only need a single episode info, so restricting page size to one episode
+            # and dealing with page number as with episode number
+            r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1'
+            % (self._consumer_secret, series_id, episode_number),
+            video_id, 'Downloading episode info JSON', fatal=False)
+        if episode_info:
+            value = episode_info.get('value')
+            if value:
+                subfile = value[0].get('subfile') or value[0].get('new_subfile')
+                if subfile and subfile != 'http://www.dramafever.com/st/':
+                    subtitles.setdefault('English', []).append({
+                        'ext': 'srt',
+                        'url': subfile,
+                    })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+    IE_NAME = 'dramafever:series'
+    _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+    _TESTS = [{
+        'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+        'info_dict': {
+            'id': '4512',
+            'title': 'Cooking with Shin',
+            'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://www.dramafever.com/drama/124/IRIS/',
+        'info_dict': {
+            'id': '124',
+            'title': 'IRIS',
+            'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+        },
+        'playlist_count': 20,
+    }]
+
+    _PAGE_SIZE = 60  # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        series = self._download_json(
+            'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+            % (self._consumer_secret, series_id),
+            series_id, 'Downloading series JSON')['series'][series_id]
+
+        title = clean_html(series['name'])
+        description = clean_html(series.get('description') or series.get('description_short'))
+
+        entries = []
+        for page_num in itertools.count(1):
+            episodes = self._download_json(
+                'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+                % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num),
+                series_id, 'Downloading episodes JSON page #%d' % page_num)
+            for episode in episodes.get('value', []):
+                episode_url = episode.get('episode_url')
+                if not episode_url:
+                    continue
+                entries.append(self.url_result(
+                    compat_urlparse.urljoin(url, episode_url),
+                    'DramaFever', episode.get('guid')))
+            if page_num == episodes['num_pages']:
+                break
+
+        return self.playlist_result(entries, series_id, title, description)
index 7626219baf33522958960bca0d8babe67b8a332e..8b98b013adeee32c67c769acfb88d76edff9a1f7 100644 (file)
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
-        'md5': 'fe330252ddea607635cf2eb2c99a0af3',
         'info_dict': {
             'id': '65517',
             'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
             'upload_date': '20110120',
             'duration': 3664,
         },
+        'params': {
+            'skip_download': True,  # requires rtmp
+        },
     }, {
         'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
         'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
                         'format_id': file['Type'].replace('Video', ''),
                         'preference': preferencemap.get(file['Type'], -10),
                     })
+                    if format['url'].startswith('rtmp'):
+                        rtmp_url = format['url']
+                        format['rtmp_live'] = True  # --resume does not work
+                        if '/bonanza/' in rtmp_url:
+                            format['play_path'] = rtmp_url.split('/bonanza/')[1]
                     formats.append(format)
                 elif file['Type'] == "Thumb":
                     thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
         description = '%s\n%s\n%s\n' % (
             info['Description'], info['Actors'], info['Colophon'])
 
-        for f in formats:
-            f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
-            f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
         self._sort_formats(formats)
 
         display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
index 37c5c181f799efd8ee69d850c0b6076130c64073..639f9182c5484a22f0056e25fc6aa7e56f193df5 100644 (file)
@@ -36,25 +36,24 @@ class DrTuberIE(InfoExtractor):
             r'<source src="([^"]+)"', webpage, 'video URL')
 
         title = self._html_search_regex(
-            [r'class="hd_title" style="[^"]+">([^<]+)</h1>', r'<title>([^<]+) - \d+'],
+            [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
             webpage, 'title')
 
         thumbnail = self._html_search_regex(
             r'poster="([^"]+)"',
             webpage, 'thumbnail', fatal=False)
 
-        like_count = str_to_int(self._html_search_regex(
-            r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
-            webpage, 'like count', fatal=False))
-        dislike_count = str_to_int(self._html_search_regex(
-            r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
-            webpage, 'like count', fatal=False))
-        comment_count = str_to_int(self._html_search_regex(
-            r'<span class="comments_count">([\d,\.]+)</span>',
-            webpage, 'comment count', fatal=False))
+        def extract_count(id_, name):
+            return str_to_int(self._html_search_regex(
+                r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+                webpage, '%s count' % name, fatal=False))
+
+        like_count = extract_count('rate_likes', 'like')
+        dislike_count = extract_count('rate_dislikes', 'dislike')
+        comment_count = extract_count('comments_count', 'comment')
 
         cats_str = self._search_regex(
-            r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False)
+            r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
         categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
 
         return {
index f25ab319e66d4d5b151cd9a9d4509807b6a88617..baa24c6d13abe016cceb83bb927db15d7d300509 100644 (file)
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
 
 
 class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
                 restricted_to_denmark = asset['RestrictedToDenmark']
                 spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
                 for link in asset['Links']:
-                    target = link['Target']
                     uri = link['Uri']
+                    target = link['Target']
                     format_id = target
-                    preference = -1 if target == 'HDS' else -2
+                    preference = None
                     if spoken_subtitles:
-                        preference -= 2
+                        preference = -1
                         format_id += '-spoken-subtitles'
-                    formats.append({
-                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
-                        'format_id': format_id,
-                        'ext': link['FileFormat'],
-                        'preference': preference,
-                    })
+                    if target == 'HDS':
+                        formats.extend(self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id))
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', preference=preference,
+                            m3u8_id=format_id))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': bitrate,
+                            'ext': link.get('FileFormat'),
+                        })
                 subtitles_list = asset.get('SubtitlesList')
                 if isinstance(subtitles_list, list):
                     LANGS = {
index 9c594b757e4bbe41371cc603441c4dd5230e7b74..999fb5620df2976073122fb95fbad1bb133f357a 100644 (file)
@@ -26,7 +26,7 @@ class DumpertIE(InfoExtractor):
         video_id = self._match_id(url)
 
         req = compat_urllib_request.Request(url)
-        req.add_header('Cookie', 'nsfw=1')
+        req.add_header('Cookie', 'nsfw=1; cpc=10')
         webpage = self._download_webpage(req, video_id)
 
         files_base64 = self._search_regex(
index 9cb1bf301b9ae3e327e4831bdb8a7d2437b43803..b1cd4f5d4e6fe1dbaaf6ec230aad75947887301d 100644 (file)
@@ -1,9 +1,7 @@
 from __future__ import unicode_literals
 
-from ..compat import (
-    compat_urllib_parse,
-)
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 
 
 class EHowIE(InfoExtractor):
@@ -26,7 +24,7 @@ class EHowIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
             r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
-        final_url = compat_urllib_parse.unquote(video_url)
+        final_url = compat_urllib_parse_unquote(video_url)
         uploader = self._html_search_meta('uploader', webpage)
         title = self._og_search_title(webpage).replace(' | eHow', '')
 
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
deleted file mode 100644 (file)
index 70f8efe..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import unicode_literals
-
-from .tnaflix import TNAFlixIE
-
-
-class EMPFlixIE(TNAFlixIE):
-    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
-
-    _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
-    _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
-    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-
-    _TEST = {
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
-        'info_dict': {
-            'id': '33051',
-            'display_id': 'Amateur-Finger-Fuck',
-            'ext': 'mp4',
-            'title': 'Amateur Finger Fuck',
-            'description': 'Amateur solo finger fucking.',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'age_limit': 18,
-        }
-    }
index 0cbca90b061cf2358600146f37f6da5b61d71709..316033cf18b42cefead780ceca15b361ebbddac7 100644 (file)
@@ -4,7 +4,10 @@ import re
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    unescapeHTML
+)
 
 
 class EroProfileIE(InfoExtractor):
@@ -75,8 +78,8 @@ class EroProfileIE(InfoExtractor):
             [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
             webpage, 'video id', default=None)
 
-        video_url = self._search_regex(
-            r'<source src="([^"]+)', webpage, 'video url')
+        video_url = unescapeHTML(self._search_regex(
+            r'<source src="([^"]+)', webpage, 'video url'))
         title = self._html_search_regex(
             r'Title:</th><td>([^<]+)</td>', webpage, 'title')
         thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644 (file)
index 0000000..e6f8f03
--- /dev/null
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ESPNIE(InfoExtractor):
+    _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+    _WORKING = False
+    _TESTS = [{
+        'url': 'http://espn.go.com/video/clip?id=10365079',
+        'info_dict': {
+            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+            'ext': 'mp4',
+            'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+            'description': '',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'class="video-play-button"[^>]+data-id="(\d+)',
+            webpage, 'video id')
+
+        player = self._download_webpage(
+            'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+
+        pcode = self._search_regex(
+            r'["\']pcode=([^"\']+)["\']', player, 'pcode')
+
+        return self.url_result(
+            'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
+            'OoyalaExternal')
index 937b28fcccf3bd58929adcca1bda9d05966460e5..e17bb9aeac51e2e10e2b68b4391d3022af35bcd5 100644 (file)
@@ -9,7 +9,7 @@ from ..compat import (
     compat_http_client,
     compat_str,
     compat_urllib_error,
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor):
             'id': '274175099429670',
             'ext': 'mp4',
             'title': 'Facebook video #274175099429670',
-        }
+        },
+        'expected_warnings': [
+            'title'
+        ]
     }, {
         'url': 'https://www.facebook.com/video.php?v=10204634152394104',
         'only_matching': True,
@@ -133,7 +136,7 @@ class FacebookIE(InfoExtractor):
             else:
                 raise ExtractorError('Cannot parse data')
         data = dict(json.loads(m.group(1)))
-        params_raw = compat_urllib_parse.unquote(data['params'])
+        params_raw = compat_urllib_parse_unquote(data['params'])
         params = json.loads(params_raw)
         video_data = params['video_data'][0]
 
@@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
-            fatal=False)
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+            default=None)
         if not video_title:
             video_title = self._html_search_regex(
                 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
-                webpage, 'alternative title', default=None)
+                webpage, 'alternative title', fatal=False)
             video_title = limit_length(video_title, 80)
         if not video_title:
             video_title = 'Facebook video #%s' % video_id
index 3c39ca451a38e69a822968911e758847657380e9..cebdd0193a82eaccc673dffe9d001f766e9e31d1 100644 (file)
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 class FazIE(InfoExtractor):
     IE_NAME = 'faz.net'
-    _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
         'info_dict': {
             'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
             'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
             'description': 'md5:1453fbf9a0d041d985a47306192ea253',
         },
-    }
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644 (file)
index 3191116..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
-                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
-    _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
-    _TESTS = [{
-        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
-        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
-        'info_dict': {
-            'id': 'FEB892FA160EBD01',
-            'ext': 'flv',
-            'title': 'bbb_theora_486kbit.flv',
-            'thumbnail': 're:^http://.*\.jpg$',
-        },
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://firedrive.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.firedrive.com')
-
-        webpage = self._download_webpage(req, video_id,
-                                         'Downloading video page')
-
-        title = self._search_regex(r'class="external_title_left">(.+)</div>',
-                                   webpage, 'title')
-        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
-                                       'thumbnail', fatal=False)
-        if thumbnail is not None:
-            thumbnail = 'http:' + thumbnail
-
-        ext = self._search_regex(r'type:\s?\'([^\']+)\',',
-                                 webpage, 'extension', fatal=False)
-        video_url = self._search_regex(
-            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': ext,
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644 (file)
index 0000000..13fbc4d
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    http://
+                        (?:www\.)?5-tv\.ru/
+                        (?:
+                            (?:[^/]+/)+(?P<id>\d+)|
+                            (?P<path>[^/?#]+)(?:[/?#])?
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'http://5-tv.ru/news/96814/',
+        'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+        'info_dict': {
+            'id': '96814',
+            'ext': 'mp4',
+            'title': 'Россияне выбрали имя для общенациональной платежной системы',
+            'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://5-tv.ru/video/1021729/',
+        'info_dict': {
+            'id': '1021729',
+            'ext': 'mp4',
+            'title': '3D принтер',
+            'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+        'info_dict': {
+            'id': 'glavnoe',
+            'ext': 'mp4',
+            'title': 'Итоги недели с 8 по 14 июня 2015 года',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/films/1507502/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/programs/broadcast/508713/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/angel/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+            webpage, 'video url')
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+        }
index 363866b6483fd9b419aa3a7704321339ed7494ac..df7665176ec3827f836e18b8ca46e3fec7c97c3b 100644 (file)
@@ -5,7 +5,7 @@ from ..utils import smuggle_url
 
 
 class FoxSportsIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?foxsports\.com/video\?vid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
 
     _TEST = {
         'url': 'http://www.foxsports.com/video?vid=432609859715',
index edf555b2987520618b70bf8bd423c5fc1f60e5a9..75723c00dc9e96c018e3b6771e634ff93c293ba1 100644 (file)
@@ -6,18 +6,15 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlparse,
-    compat_urlparse,
-)
+from ..compat import compat_urlparse
 from ..utils import (
     clean_html,
     ExtractorError,
     int_or_none,
-    float_or_none,
     parse_duration,
     determine_ext,
 )
+from .dailymotion import DailymotionCloudIE
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -58,12 +55,12 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
                     # See https://github.com/rg3/youtube-dl/issues/3963
                     # m3u8 urls work fine
                     continue
-                video_url_parsed = compat_urllib_parse_urlparse(video_url)
                 f4m_url = self._download_webpage(
-                    'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+                    'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url,
                     video_id, 'Downloading f4m manifest token', fatal=False)
                 if f4m_url:
-                    formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))
+                    formats.extend(self._extract_f4m_formats(
+                        f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id))
             elif ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id))
             elif video_url.startswith('rtmp'):
@@ -86,7 +83,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
             'title': info['titre'],
             'description': clean_html(info['synopsis']),
             'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
-            'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']),
+            'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
             'timestamp': int_or_none(info['diffusion']['timestamp']),
             'formats': formats,
         }
@@ -131,12 +128,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
             'skip_download': 'HLS (reqires ffmpeg)'
         },
         'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+        'info_dict': {
+            'id': '556e03339473995ee145930c',
+            'ext': 'mp4',
+            'title': 'Les entreprises familiales : le secret de la réussite',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+        }
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
+
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
         video_id, catalogue = self._search_regex(
             r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
         return self._extract_video(video_id, catalogue)
@@ -145,11 +156,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
 class FranceTVIE(FranceTVBaseInfoExtractor):
     IE_NAME = 'francetv'
     IE_DESC = 'France 2, 3, 4, 5 and Ô'
-    _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
-        (?:
-            emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
-        |   (emissions?|jt)/(?P<key>[^/?]+)
-        )'''
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?france[2345o]\.fr/
+                                (?:
+                                    emissions/[^/]+/(?:videos|diffusions)|
+                                    emission/[^/]+|
+                                    videos|
+                                    jt
+                                )
+                            /|
+                            embed\.francetv\.fr/\?ue=
+                        )
+                        (?P<id>[^/?]+)
+                    '''
 
     _TESTS = [
         # france2
@@ -206,24 +227,46 @@ class FranceTVIE(FranceTVBaseInfoExtractor):
         },
         # franceo
         {
-            'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
-            'md5': '52f0bfe202848b15915a2f39aaa8981b',
+            'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
+            'md5': '47d5816d3b24351cdce512ad7ab31da8',
             'info_dict': {
-                'id': '108634970',
+                'id': '125377621',
                 'ext': 'flv',
-                'title': 'Infô Afrique',
-                'description': 'md5:ebf346da789428841bee0fd2a935ea55',
-                'upload_date': '20140915',
-                'timestamp': 1410822000,
+                'title': 'Infô soir',
+                'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
+                'upload_date': '20150718',
+                'timestamp': 1437241200,
+                'duration': 414,
+            },
+        },
+        {
+            # francetv embed
+            'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
+            'info_dict': {
+                'id': 'EV_30231',
+                'ext': 'flv',
+                'title': 'Alcaline, le concert avec Calogero',
+                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+                'upload_date': '20150226',
+                'timestamp': 1424989860,
+                'duration': 5400,
             },
         },
+        {
+            'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.franceo.fr/videos/125377617',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id'))
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
         video_id, catalogue = self._html_search_regex(
-            r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+            r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
             webpage, 'video ID').split('@')
         return self._extract_video(video_id, catalogue)
 
index 47373e21540030d4c9a19dbfc1c5943f468fea4f..b3f1bafcc37ee98f1c5b89a644909f3ee0a32049 100644 (file)
@@ -5,7 +5,7 @@ import json
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -14,8 +14,8 @@ from ..utils import (
 
 
 class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
-    _TEST = {
+    _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+    _TESTS = [{
         'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
         'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
         'info_dict': {
@@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Arma 3 - Community Guide: SITREP I',
             'description': 'Check out this video where some of the basics of Arma 3 is explained.',
-        }
-    }
+        },
+    }, {
+        'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+        'info_dict': {
+            'id': 'gs-2300-6424837',
+            'ext': 'flv',
+            'title': 'The Witcher 3: Wild Hunt [Xbox ONE]  - Now Playing',
+            'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+        },
+    }]
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
@@ -32,30 +40,42 @@ class GameSpotIE(InfoExtractor):
         data_video_json = self._search_regex(
             r'data-video=["\'](.*?)["\']', webpage, 'data video')
         data_video = json.loads(unescapeHTML(data_video_json))
+        streams = data_video['videoStreams']
 
-        # Transform the manifest url to a link to the mp4 files
-        # they are used in mobile devices.
-        f4m_url = data_video['videoStreams']['f4m_stream']
-        f4m_path = compat_urlparse.urlparse(f4m_url).path
-        QUALITIES_RE = r'((,\d+)+,?)'
-        qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
-        http_path = f4m_path[1:].split('/', 1)[1]
-        http_template = re.sub(QUALITIES_RE, r'%s', http_path)
-        http_template = http_template.replace('.csmil/manifest.f4m', '')
-        http_template = compat_urlparse.urljoin(
-            'http://video.gamespotcdn.com/', http_template)
         formats = []
-        for q in qualities:
-            formats.append({
-                'url': http_template % q,
-                'ext': 'mp4',
-                'format_id': q,
-            })
+        f4m_url = streams.get('f4m_stream')
+        if f4m_url is not None:
+            # Transform the manifest url to a link to the mp4 files
+            # they are used in mobile devices.
+            f4m_path = compat_urlparse.urlparse(f4m_url).path
+            QUALITIES_RE = r'((,\d+)+,?)'
+            qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
+            http_path = f4m_path[1:].split('/', 1)[1]
+            http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+            http_template = http_template.replace('.csmil/manifest.f4m', '')
+            http_template = compat_urlparse.urljoin(
+                'http://video.gamespotcdn.com/', http_template)
+            for q in qualities:
+                formats.append({
+                    'url': http_template % q,
+                    'ext': 'mp4',
+                    'format_id': q,
+                })
+        else:
+            for quality in ['sd', 'hd']:
+                # It's actually a link to a flv file
+                flv_url = streams.get('f4m_{0}'.format(quality))
+                if flv_url is not None:
+                    formats.append({
+                        'url': flv_url,
+                        'ext': 'flv',
+                        'format_id': quality,
+                    })
 
         return {
             'id': data_video['guid'],
             'display_id': page_id,
-            'title': compat_urllib_parse.unquote(data_video['title']),
+            'title': compat_urllib_parse_unquote(data_video['title']),
             'formats': formats,
             'description': self._html_search_meta('description', webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
index cd7c47d6d1801ef2468d789215f0f5f0b438565e..6d2efb22e784ecd40dcdebe5195a0d8dde63d632 100644 (file)
@@ -8,7 +8,8 @@ import re
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
 )
@@ -32,11 +33,21 @@ from .brightcove import BrightcoveIE
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
+from .tvc import TVCIE
+from .sportbox import SportBoxEmbedIE
 from .smotri import SmotriIE
+from .myvi import MyviIE
 from .condenast import CondeNastIE
 from .udn import UDNEmbedIE
 from .senateisvp import SenateISVPIE
 from .bliptv import BlipTVIE
+from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
+from .onionstudios import OnionStudiosIE
+from .snagfilms import SnagFilmsEmbedIE
 
 
 class GenericIE(InfoExtractor):
@@ -44,6 +55,97 @@ class GenericIE(InfoExtractor):
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
     _TESTS = [
+        # Direct link to a video
+        {
+            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+            'info_dict': {
+                'id': 'trailer',
+                'ext': 'mp4',
+                'title': 'trailer',
+                'upload_date': '20100513',
+            }
+        },
+        # Direct link to media delivered compressed (until Accept-Encoding is *)
+        {
+            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+            'md5': '128c42e68b13950268b648275386fc74',
+            'info_dict': {
+                'id': 'FictionJunction-Parallel_Hearts',
+                'ext': 'flac',
+                'title': 'FictionJunction-Parallel_Hearts',
+                'upload_date': '20140522',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # Direct download with broken HEAD
+        {
+            'url': 'http://ai-radio.org:8000/radio.opus',
+            'info_dict': {
+                'id': 'radio',
+                'ext': 'opus',
+                'title': 'radio',
+            },
+            'params': {
+                'skip_download': True,  # infinite live stream
+            },
+            'expected_warnings': [
+                r'501.*Not Implemented'
+            ],
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:.*groundbreaking video review series.*'
+            },
+            'playlist_mincount': 11,
+        },
+        # RSS feed with enclosure
+        {
+            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'info_dict': {
+                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+                'ext': 'm4v',
+                'upload_date': '20150228',
+                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+            }
+        },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 're:^Chris Ziegler takes a look at the\.*',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
         {
             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -123,17 +225,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,  # m3u8 download
             },
         },
-        # Direct link to a video
-        {
-            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
-            'info_dict': {
-                'id': 'trailer',
-                'ext': 'mp4',
-                'title': 'trailer',
-                'upload_date': '20100513',
-            }
-        },
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -158,22 +249,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Ooyala'],
         },
-        # google redirect
-        {
-            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
-            'info_dict': {
-                'id': 'cmQHVoWB5FY',
-                'ext': 'mp4',
-                'upload_date': '20130224',
-                'uploader_id': 'TheVerge',
-                'description': 're:^Chris Ziegler takes a look at the\.*',
-                'uploader': 'The Verge',
-                'title': 'First Firefox OS phones side-by-side',
-            },
-            'params': {
-                'skip_download': False,
-            }
-        },
         # embed.ly video
         {
             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -223,6 +298,66 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        # TVC embed
+        {
+            'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+            'info_dict': {
+                'id': '55304',
+                'ext': 'mp4',
+                'title': 'Дошкольное воспитание',
+            },
+        },
+        # SportBox embed
+        {
+            'url': 'http://www.vestifinance.ru/articles/25753',
+            'info_dict': {
+                'id': '25753',
+                'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': '370908',
+                    'title': 'Госзаказ. День 3',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370905',
+                    'title': 'Госзаказ. День 2',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370902',
+                    'title': 'Госзаказ. День 1',
+                    'ext': 'mp4',
+                }
+            }],
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Myvi.ru embed
+        {
+            'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+            'info_dict': {
+                'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+                'ext': 'mp4',
+                'title': 'Ужастики, русский трейлер (2015)',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 153,
+            }
+        },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+        },
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -272,6 +407,26 @@ class GenericIE(InfoExtractor):
                 'skip_download': 'Requires rtmpdump'
             }
         },
+        # francetv embed
+        {
+            'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+            'info_dict': {
+                'id': 'EV_30231',
+                'ext': 'mp4',
+                'title': 'Alcaline, le concert avec Calogero',
+                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+                'upload_date': '20150226',
+                'timestamp': 1424989860,
+                'duration': 5400,
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+            'expected_warnings': [
+                'Forbidden'
+            ]
+        },
         # Condé Nast embed
         {
             'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -374,16 +529,6 @@ class GenericIE(InfoExtractor):
                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
             }
         },
-        # RSS feed
-        {
-            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-            'info_dict': {
-                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
-                'title': 'Zero Punctuation',
-                'description': 're:.*groundbreaking video review series.*'
-            },
-            'playlist_mincount': 11,
-        },
         # Multiple brightcove videos
         # https://github.com/rg3/youtube-dl/issues/2283
         {
@@ -413,19 +558,6 @@ class GenericIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
-        # MLB articles
-        {
-            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
-            'info_dict': {
-                'id': '75609783',
-                'ext': 'mp4',
-                'title': 'Must C: Pillar climbs for catch',
-                'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
-                'timestamp': 1429124820,
-                'upload_date': '20150415',
-            }
-        },
         # Wistia embed
         {
             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
@@ -450,21 +582,6 @@ class GenericIE(InfoExtractor):
                 'uploader': 'thoughtworks.wistia.com',
             },
         },
-        # Direct download with broken HEAD
-        {
-            'url': 'http://ai-radio.org:8000/radio.opus',
-            'info_dict': {
-                'id': 'radio',
-                'ext': 'opus',
-                'title': 'radio',
-            },
-            'params': {
-                'skip_download': True,  # infinite live stream
-            },
-            'expected_warnings': [
-                r'501.*Not Implemented'
-            ],
-        },
         # Soundcloud embed
         {
             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -496,21 +613,6 @@ class GenericIE(InfoExtractor):
             },
             'playlist_mincount': 2,
         },
-        # Direct link with incorrect MIME type
-        {
-            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-            'md5': '4ccbebe5f36706d85221f204d7eb5913',
-            'info_dict': {
-                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
-                'id': '5_Lennart_Poettering_-_Systemd',
-                'ext': 'webm',
-                'title': '5_Lennart_Poettering_-_Systemd',
-                'upload_date': '20141120',
-            },
-            'expected_warnings': [
-                'URL could be a direct video link, returning it as such.'
-            ]
-        },
         # Cinchcast embed
         {
             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -598,6 +700,18 @@ class GenericIE(InfoExtractor):
                 'title': 'John Carlson Postgame 2/25/15',
             },
         },
+        # Kaltura embed (different embed code)
+        {
+            'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+            'info_dict': {
+                'id': '1_a52wc67y',
+                'ext': 'flv',
+                'upload_date': '20150127',
+                'uploader_id': 'PremierMedia',
+                'timestamp': int,
+                'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+            },
+        },
         # Eagle.Platform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -658,15 +772,16 @@ class GenericIE(InfoExtractor):
                 'title': 'Facebook Creates "On This Day" | Crunch Report',
             },
         },
-        # RSS feed with enclosure
+        # SVT embed
         {
-            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
             'info_dict': {
-                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-                'ext': 'm4v',
-                'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
+                'id': '2900353',
+                'ext': 'flv',
+                'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+                'duration': 27,
+                'age_limit': 0,
+            },
         },
         # Crooks and Liars embed
         {
@@ -742,6 +857,62 @@ class GenericIE(InfoExtractor):
                 # rtmpe downloads
                 'skip_download': True,
             }
+        },
+        # Brightcove URL in single quotes
+        {
+            'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+            'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+            'info_dict': {
+                'id': '4255764656001',
+                'ext': 'mp4',
+                'title': 'SN Presents: Russell Martin, World Citizen',
+                'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+                'uploader': 'Rogers Sportsnet',
+            },
+        },
+        # Dailymotion Cloud video
+        {
+            'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+            'md5': '49444254273501a64675a7e68c502681',
+            'info_dict': {
+                'id': '5585de919473990de4bee11b',
+                'ext': 'mp4',
+                'title': 'Le débat',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+            }
+        },
+        # OnionStudios embed
+        {
+            'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+            'info_dict': {
+                'id': '2855',
+                'ext': 'mp4',
+                'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+                'thumbnail': 're:^https?://.*\.jpe?g$',
+                'uploader': 'ClickHole',
+                'uploader_id': 'clickhole',
+            }
+        },
+        # SnagFilms embed
+        {
+            'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+            'info_dict': {
+                'id': '74849a00-85a9-11e1-9660-123139220831',
+                'ext': 'mp4',
+                'title': '#whilewewatch',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
         }
     ]
 
@@ -863,7 +1034,7 @@ class GenericIE(InfoExtractor):
             force_videoid = smuggled_data['force_videoid']
             video_id = force_videoid
         else:
-            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+            video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
         self.to_screen('%s: Requesting header' % video_id)
 
@@ -885,7 +1056,9 @@ class GenericIE(InfoExtractor):
 
         full_response = None
         if head_response is False:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
             head_response = full_response
 
         # Check for direct link to a video
@@ -896,7 +1069,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'formats': [{
                     'format_id': m.group('format_id'),
@@ -907,10 +1080,22 @@ class GenericIE(InfoExtractor):
             }
 
         if not self._downloader.params.get('test', False) and not is_intentional:
-            self._downloader.report_warning('Falling back on generic information extractor.')
+            force = self._downloader.params.get('force_generic_extractor', False)
+            self._downloader.report_warning(
+                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
 
         if not full_response:
-            full_response = self._request_webpage(url, video_id)
+            request = compat_urllib_request.Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
+            # making it impossible to download only chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
+            # that will always result in downloading the whole file that is not desirable.
+            # Therefore for extraction pass we have to override Accept-Encoding to any in order
+            # to accept raw bytes and being able to download only a chunk.
+            # It may probably better to solve this by checking Content-Type for application/octet-stream
+            # after HEAD request finishes, but not sure if we can rely on this.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
 
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
@@ -922,7 +1107,7 @@ class GenericIE(InfoExtractor):
                 head_response.headers.get('Last-Modified'))
             return {
                 'id': video_id,
-                'title': os.path.splitext(url_basename(url))[0],
+                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
                 'url': url,
                 'upload_date': upload_date,
@@ -949,7 +1134,7 @@ class GenericIE(InfoExtractor):
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
-        webpage = compat_urllib_parse.unquote(webpage)
+        webpage = compat_urllib_parse_unquote(webpage)
 
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
@@ -1002,23 +1187,20 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded rtl.nl player
         matches = re.findall(
-            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
+            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
             webpage)
         if matches:
             return _playlist_from_matches(matches, ie='RtlNl')
 
-        # Look for embedded (iframe) Vimeo player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
-        if mobj:
-            player_url = unescapeHTML(mobj.group('url'))
-            surl = smuggle_url(player_url, {'Referer': url})
-            return self.url_result(surl)
-        # Look for embedded (swf embed) Vimeo player
-        mobj = re.search(
-            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
-        if mobj:
-            return self.url_result(mobj.group(1))
+        vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url)
+
+        vid_me_embed_url = self._search_regex(
+            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+            webpage, 'vid.me embed', default=None)
+        if vid_me_embed_url is not None:
+            return self.url_result(vid_me_embed_url, 'Vidme')
 
         # Look for embedded YouTube player
         matches = re.findall(r'''(?x)
@@ -1091,6 +1273,11 @@ class GenericIE(InfoExtractor):
         if bliptv_url:
             return self.url_result(bliptv_url, 'BlipTV')
 
+        # Look for SVT player
+        svt_url = SVTIE._extract_url(webpage)
+        if svt_url:
+            return self.url_result(svt_url, 'SVT')
+
         # Look for embedded condenast player
         matches = re.findall(
             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
@@ -1207,7 +1394,7 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
         if mobj is not None:
-            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
 
         # Look for funnyordie embed
         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -1225,6 +1412,32 @@ class GenericIE(InfoExtractor):
         if rutv_url:
             return self.url_result(rutv_url, 'RUTV')
 
+        # Look for embedded TVC player
+        tvc_url = TVCIE._extract_url(webpage)
+        if tvc_url:
+            return self.url_result(tvc_url, 'TVC')
+
+        # Look for embedded SportBox player
+        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+        if sportbox_urls:
+            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
+        # Look for embedded PornHub player
+        pornhub_url = PornHubIE._extract_url(webpage)
+        if pornhub_url:
+            return self.url_result(pornhub_url, 'PornHub')
+
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
+        # Look for embedded Tvigle player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Tvigle')
+
         # Look for embedded TED player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1244,11 +1457,23 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
 
+        # Look for embedded francetv player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for embedded smotri.com player
         smotri_url = SmotriIE._extract_url(webpage)
         if smotri_url:
             return self.url_result(smotri_url, 'Smotri')
 
+        # Look for embedded Myvi.ru player
+        myvi_url = MyviIE._extract_url(webpage)
+        if myvi_url:
+            return self.url_result(myvi_url)
+
         # Look for embeded soundcloud player
         mobj = re.search(
             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
@@ -1328,8 +1553,8 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'), 'Zapiks')
 
         # Look for Kaltura embeds
-        mobj = re.search(
-            r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
+        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
+                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
         if mobj is not None:
             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
 
@@ -1384,7 +1609,31 @@ class GenericIE(InfoExtractor):
         # Look for Senate ISVP iframe
         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
         if senate_isvp_url:
-            return self.url_result(surl, 'SenateISVP')
+            return self.url_result(senate_isvp_url, 'SenateISVP')
+
+        # Look for Dailymotion Cloud videos
+        dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+        if dmcloud_url:
+            return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+        # Look for OnionStudios embeds
+        onionstudios_url = OnionStudiosIE._extract_url(webpage)
+        if onionstudios_url:
+            return self.url_result(onionstudios_url)
+
+        # Look for SnagFilms embeds
+        snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
+        if snagfilms_url:
+            return self.url_result(snagfilms_url)
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
 
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
@@ -1453,7 +1702,7 @@ class GenericIE(InfoExtractor):
                 if refresh_header:
                     found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
-                new_url = compat_urlparse.urljoin(url, found.group(1))
+                new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
                 self.report_following_redirect(new_url)
                 return {
                     '_type': 'url',
@@ -1465,7 +1714,7 @@ class GenericIE(InfoExtractor):
         entries = []
         for video_url in found:
             video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
 
             # Sometimes, jwplayer extraction will result in a YouTube URL
             if YoutubeIE.suitable(video_url):
index 397f1d42eea83b774791b6a53ce087f60a5dcca1..884700c52b90b53fdc8f581378d28611d7c36f33 100644 (file)
@@ -6,12 +6,13 @@ from ..utils import (
     int_or_none,
     float_or_none,
     qualities,
+    ExtractorError,
 )
 
 
 class GfycatIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?P<id>[^/?#]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P<id>[^/?#]+)'
+    _TESTS = [{
         'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
         'info_dict': {
             'id': 'DeadlyDecisiveGermanpinscher',
@@ -27,14 +28,33 @@ class GfycatIE(InfoExtractor):
             'categories': list,
             'age_limit': 0,
         }
-    }
+    }, {
+        'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
+        'info_dict': {
+            'id': 'JauntyTimelyAmazontreeboa',
+            'ext': 'mp4',
+            'title': 'JauntyTimelyAmazontreeboa',
+            'timestamp': 1411720126,
+            'upload_date': '20140926',
+            'uploader': 'anonymous',
+            'duration': 3.52,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         gfy = self._download_json(
             'http://gfycat.com/cajax/get/%s' % video_id,
-            video_id, 'Downloading video info')['gfyItem']
+            video_id, 'Downloading video info')
+        if 'error' in gfy:
+            raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
+        gfy = gfy['gfyItem']
 
         title = gfy.get('title') or gfy['gfyName']
         description = gfy.get('description')
index 6147596e4c5d082d54f975b7428a400679b9dc32..f006f0cb105dc9d7b0c1f495dbcd4c840597858a 100644 (file)
@@ -78,12 +78,7 @@ class GorillaVidIE(InfoExtractor):
         if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         if fields['op'] == 'download1':
             countdown = int_or_none(self._search_regex(
index 63d87b74cc2d5258960fce3b0c6cd5eca48a0b11..f5aa73d18b47ff225b7e7e332051b97583ca8237 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
         title = self._html_search_regex(
-            r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+            r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
             webpage, 'title')
         wrap_url = self._html_search_regex(
-            r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+            r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
         wrap_webpage = self._download_webpage(wrap_url, video_id)
 
         video_url = self._html_search_regex(
-            r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+            r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
 
         return {
             'id': video_id,
index 704d0285d3e1c2ce10e8f3929543c6c66b0fd58a..a3154cfdeccf9b4c18cf5a5b01f7944243fb1509 100644 (file)
@@ -58,11 +58,7 @@ class HostingBulkIE(InfoExtractor):
             r'<img src="([^"]+)".+?class="pic"',
             webpage, 'thumbnail', fatal=False)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         request = compat_urllib_request.Request(url, urlencode_postdata(fields))
         request.add_header('Content-type', 'application/x-www-form-urlencoded')
index 3f7d6666c0810e545c0f285dcf70689806c4dac7..16677f179ecd77040e8fdc42bfb9e4095aa1774a 100644 (file)
@@ -1,8 +1,7 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..utils import parse_iso8601
 
 
 class HowcastIE(InfoExtractor):
@@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):
         'info_dict': {
             'id': '390161',
             'ext': 'mp4',
-            'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
             'title': 'How to Tie a Square Knot Properly',
-        }
+            'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+            'timestamp': 1276081287,
+            'upload_date': '20100609',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
+        video_id = self._match_id(url)
 
-        video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        self.report_extraction(video_id)
-
-        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
-                                       webpage, 'video URL')
-
-        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
-                                                    webpage, 'description', fatal=False)
+        embed_code = self._search_regex(
+            r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
+            webpage, 'ooyala embed code')
 
         return {
+            '_type': 'url_transparent',
+            'ie_key': 'Ooyala',
+            'url': 'ooyala:%s' % embed_code,
             'id': video_id,
-            'url': video_url,
-            'title': self._og_search_title(webpage),
-            'description': video_description,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'timestamp': parse_iso8601(self._html_search_meta(
+                'article:published_time', webpage, 'timestamp')),
         }
index e9733912132798d99be18bb935dcd3c3b190525d..663e6632a194d8ee271a0c031a921d7eed139005 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class HowStuffWorksIE(InfoExtractor):
-    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
     _TESTS = [
         {
             'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
@@ -46,6 +46,10 @@ class HowStuffWorksIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
+        {
+            'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
index f29df36b5bf6bd7e732ad84cbfd7d3eeb412f5ff..4bb574cf37df2421721b088ded37a3fc66c8c2ea 100644 (file)
@@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor):
             format_info = info['videoPlayerObject']['video']
             formats.append({
                 'format_id': f_id,
-                'url': format_info['url'],
+                'url': format_info['videoInfoList'][0]['videoUrl'],
             })
 
         return {
index fe5d95e2c9cad488f342233e7ebfd52e42a86de3..d692ea79ab493174038c9649445e6a592a86687c 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
     js_to_json,
@@ -12,7 +13,7 @@ from ..utils import (
 
 
 class ImgurIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)'
 
     _TESTS = [{
         'url': 'https://i.imgur.com/A61SaA1.gifv',
@@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, video_id), video_id)
 
         width = int_or_none(self._search_regex(
             r'<param name="width" value="([0-9]+)"',
index 0847074eeb0f0f258db079495c7f30777b2d1838..65712abc28c3cc68cab7052ab709b2c1e6500cb5 100644 (file)
@@ -7,7 +7,7 @@ from .common import InfoExtractor
 
 
 class InaIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
     _TEST = {
         'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
         'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
index f25f43664e262b25473557c5f11dae91e697e3f6..71cfd12c56549d0be540c9daee6a2732959039de 100644 (file)
@@ -4,14 +4,15 @@ import base64
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
 )
 
 
 class InfoQIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
         'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
         'info_dict': {
@@ -20,7 +21,10 @@ class InfoQIE(InfoExtractor):
             'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
             'title': 'A Few of My Favorite [Python] Things',
         },
-    }
+    }, {
+        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,14 +39,14 @@ class InfoQIE(InfoExtractor):
         # Extract video URL
         encoded_id = self._search_regex(
             r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
-        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
+        real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
         playpath = 'mp4:' + real_id
 
         video_filename = playpath.split('/')[-1]
         video_id, extension = video_filename.split('.')
 
         http_base = self._search_regex(
-            r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+            r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,
             'HTTP base URL')
 
         formats = [{
@@ -52,7 +56,7 @@ class InfoQIE(InfoExtractor):
             'play_path': playpath,
         }, {
             'format_id': 'http',
-            'url': http_base + real_id,
+            'url': compat_urlparse.urljoin(url, http_base) + real_id,
         }]
         self._sort_formats(formats)
 
index 65f6ca103973bb25c016ae92fcb551c65def31d1..3d78f78c46d1ad004339bc33ebcb09d1286e5092 100644 (file)
@@ -3,13 +3,16 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    limit_length,
+)
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
+    _VALID_URL = r'https://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
     _TEST = {
-        'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
+        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
         'info_dict': {
             'id': 'aye83DjauH',
@@ -41,11 +44,11 @@ class InstagramIE(InfoExtractor):
 
 
 class InstagramUserIE(InfoExtractor):
-    _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+    _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
     IE_DESC = 'Instagram user profile'
     IE_NAME = 'instagram:user'
     _TEST = {
-        'url': 'http://instagram.com/porsche',
+        'url': 'https://instagram.com/porsche',
         'info_dict': {
             'id': 'porsche',
             'title': 'porsche',
@@ -100,11 +103,13 @@ class InstagramUserIE(InfoExtractor):
                 thumbnails_el = it.get('images', {})
                 thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
 
-                title = it.get('caption', {}).get('text', it['id'])
+                # In some cases caption is null, which corresponds to None
+                # in python. As a result, it.get('caption', {}) gives None
+                title = (it.get('caption') or {}).get('text', it['id'])
 
                 entries.append({
                     'id': it['id'],
-                    'title': title,
+                    'title': limit_length(title, 80),
                     'formats': formats,
                     'thumbnail': thumbnail,
                     'webpage_url': it.get('link'),
index 8529bedfc0ab283790e74144bc9d570df19dc4b3..821c8ec109236b787b9afa2985e450ff8a647595 100644 (file)
@@ -11,11 +11,12 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    remove_end,
 )
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+    _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
 
     _TESTS = [{
         'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +24,7 @@ class IPrimaIE(InfoExtractor):
             'id': '39152',
             'ext': 'flv',
             'title': 'Partička (92)',
-            'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+            'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
             'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
         },
         'params': {
@@ -35,13 +36,14 @@ class IPrimaIE(InfoExtractor):
             'id': '9718337',
             'ext': 'flv',
             'title': 'Tchibo Partička - Jarní móda',
-            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
             'thumbnail': 're:^http:.*\.jpg$',
         },
         'params': {
             'skip_download': True,  # requires rtmpdump
         },
-        'skip': 'Do not have permission to access this page',
+    }, {
+        'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -102,8 +104,10 @@ class IPrimaIE(InfoExtractor):
 
         return {
             'id': real_id,
-            'title': self._og_search_title(webpage),
+            'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
-            'description': self._og_search_description(webpage),
+            'description': self._search_regex(
+                r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+                webpage, 'description', default=None),
         }
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644 (file)
index 0000000..afb7f4e
--- /dev/null
@@ -0,0 +1,273 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import random
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+    IE_DESC = '爱奇艺'
+
+    _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        def do_xor(x, y):
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(l):
+            a = 0
+            b = l.split('-')
+            c = len(b)
+            s = ''
+            for i in range(c - 1, -1, -1):
+                a = do_xor(int(b[c - i - 1], 16), i)
+                s += chr(a)
+            return s[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return hashlib.md5((t + mg + x).encode('utf8')).hexdigest()
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if 0 < int(format_item['bid']) <= 10:
+                format_id = self.get_format(format_item['bid'])
+            else:
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
+        return matched_format_ids[0] if len(matched_format_ids) else None
+
+    def get_bid(self, format_id):
+        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
+        return matched_bids[0] if len(matched_bids) else None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        tm = str(int(time.time()))
+        param = {
+            'key': 'fvip',
+            'src': hashlib.md5(b'youtube-dl').hexdigest(),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': tm,
+            'enc': hashlib.md5(
+                (enc_key + tm + tvid).encode('utf8')).hexdigest(),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': hashlib.md5(
+                (tm + tvid).encode('utf8')).hexdigest()
+        }
+
+        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
+            compat_urllib_parse.urlencode(param)
+        raw_data = self._download_json(api_url, video_id)
+        return raw_data
+
+    def get_enc_key(self, swf_url, video_id):
+        enc_key = '8e29ab5666d041c3a1ea76e06dabdffb'
+        return enc_key
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('No support iQiqy VIP video')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info
+        entries = []
+        for format_id in video_urls_dict:
+            video_urls = video_urls_dict[format_id]
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        for i in range(len(entries)):
+            self._sort_formats(entries[i]['formats'])
+            entries[i].update(
+                {
+                    'id': '%s_part%d' % (video_id, i + 1),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
index 99a1361f844c15520c842cd9fffa1e5c2e9b6974..bc226fa67c064b991674a510b1eba54d40dc67e0 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     determine_ext,
     float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'md5:253753e2655dde93f59f74b572454f6d',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'pelikzzle',
-                'timestamp': 1404302298,
+                'timestamp': int,
                 'upload_date': '20140702',
                 'duration': 95.395,
                 'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
                 'description': 'Tarkan Dortmund 2006 Konseri',
                 'thumbnail': 're:^http://.*\.jpg',
                 'uploader_id': 'parlayankiz',
-                'timestamp': 1163322193,
+                'timestamp': int,
                 'upload_date': '20061112',
                 'duration': 253.666,
                 'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
 
         uploader = self._html_search_regex(
             r"adduserUsername\s*=\s*'([^']+)';",
-            webpage, 'uploader', fatal=False, default='')
+            webpage, 'uploader', fatal=False)
         timestamp = parse_iso8601(self._html_search_meta(
-            'uploadDate', webpage, 'upload date', fatal=False))
+            'uploadDate', webpage, 'upload date'))
 
         duration = float_or_none(self._html_search_regex(
             r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
 
         # Might be empty for some videos.
         streams = self._html_search_regex(
-            r'"qualitylevel"\s*:\s*"([^"]+)"',
-            webpage, 'streams', fatal=False, default='')
+            r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
 
         formats = []
         if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
                 quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
                 formats.append({
                     'format_id': '%sp' % quality if quality else 'sd',
-                    'url': url,
+                    'url': compat_urllib_parse_unquote(url),
                     'ext': ext,
                 })
         else:
             stream_url = self._search_regex(
-                r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+                r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
             formats.append({
                 'format_id': 'sd',
-                'url': stream_url,
+                'url': compat_urllib_parse_unquote(stream_url),
                 'ext': ext,
             })
 
index d0720ff561c16e8c0816c5ff7ab333e54c297dbc..1df084d87ae4c712d9bcfa1aac6d6367641287b5 100644 (file)
@@ -8,9 +8,9 @@ from .common import InfoExtractor
 
 
 class JeuxVideoIE(InfoExtractor):
-    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
+    _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
         'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
         'info_dict': {
@@ -19,7 +19,10 @@ class JeuxVideoIE(InfoExtractor):
             'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
             'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
         },
-    }
+    }, {
+        'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 2bb078036391e3c370a6e3b72a61eccc299b3e67..4597d1b961a0fcae8137d3ec919fc2ce6ac31777 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     float_or_none,
+    srt_subtitles_timecode,
 )
 
 
@@ -39,8 +40,8 @@ class KanalPlayIE(InfoExtractor):
             '%s\r\n%s --> %s\r\n%s'
             % (
                 num,
-                self._subtitles_timecode(item['startMillis'] / 1000.0),
-                self._subtitles_timecode(item['endMillis'] / 1000.0),
+                srt_subtitles_timecode(item['startMillis'] / 1000.0),
+                srt_subtitles_timecode(item['endMillis'] / 1000.0),
                 item['text'],
             ) for num, item in enumerate(subs, 1))
 
index e3b43ff8dbfec5e065aa069a0fb140dcfb5822c9..06daf5a89ce3ffde4d71d7dc8ceee9441840b72b 100644 (file)
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
 from ..utils import (
     js_to_json,
 )
@@ -24,7 +24,7 @@ class KaraoketvIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         page_video_url = self._og_search_video_url(webpage, video_id)
-        config_json = compat_urllib_parse.unquote_plus(self._search_regex(
+        config_json = compat_urllib_parse_unquote_plus(self._search_regex(
             r'config=(.*)', page_video_url, 'configuration'))
 
         urls_info_json = self._download_json(
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..bed94bc
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(r'<h1 class="title">([^<]+)</h1>'))
+
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
index 7d4b57056509383fdc082a68c1650f38dc258763..1d391e69ff7e0aba1b78ae5e32792b2dca839943 100644 (file)
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
             'uploader': 'Pebble Technology',
             'title': 'Pebble iOS Notifications',
         }
+    }, {
+        'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+        'info_dict': {
+            'id': '1420158244',
+            'ext': 'mp4',
+            'title': 'Power Drive 2000',
+        },
+        'expected_warnings': ['OpenGraph description'],
     }]
 
     def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
                 'title': title,
             }
 
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            thumbnail = self._html_search_regex(
+                r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+                webpage, 'thumbnail image', fatal=False)
         return {
             'id': video_id,
             'url': video_url,
             'title': title,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': thumbnail,
         }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
new file mode 100644 (file)
index 0000000..1077846
--- /dev/null
@@ -0,0 +1,314 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_id,
+    clean_html,
+    ExtractorError,
+    remove_start,
+)
+
+
+class KuwoBaseIE(InfoExtractor):
+    _FORMATS = [
+        {'format': 'ape', 'ext': 'ape', 'preference': 100},
+        {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
+        {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
+        {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
+        {'format': 'wma', 'ext': 'wma', 'preference': 20},
+        {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
+    ]
+
+    def _get_formats(self, song_id):
+        formats = []
+        for file_format in self._FORMATS:
+            song_url = self._download_webpage(
+                'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' %
+                (file_format['ext'], file_format.get('br', ''), song_id),
+                song_id, note='Download %s url info' % file_format['format'],
+            )
+            if song_url.startswith('http://') or song_url.startswith('https://'):
+                formats.append({
+                    'url': song_url,
+                    'format_id': file_format['format'],
+                    'format': file_format['format'],
+                    'preference': file_format['preference'],
+                    'abr': file_format.get('abr'),
+                })
+        self._sort_formats(formats)
+        return formats
+
+
+class KuwoIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:song'
+    IE_DESC = '酷我音乐'
+    _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/yinyue/635632/',
+        'info_dict': {
+            'id': '635632',
+            'ext': 'ape',
+            'title': '爱我别走',
+            'creator': '张震岳',
+            'upload_date': '20080122',
+            'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
+        },
+    }, {
+        'url': 'http://www.kuwo.cn/yinyue/6446136/',
+        'info_dict': {
+            'id': '6446136',
+            'ext': 'mp3',
+            'title': '心',
+            'creator': 'IU',
+            'upload_date': '20150518',
+        },
+        'params': {
+            'format': 'mp3-320'
+        },
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download song detail info',
+            errnote='Unable to get song detail info')
+
+        song_name = self._html_search_regex(
+            r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
+        singer_name = self._html_search_regex(
+            r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
+            webpage, 'singer name', fatal=False)
+        lrc_content = clean_html(get_element_by_id('lrcContent', webpage))
+        if lrc_content == '暂无':     # indicates no lyrics
+            lrc_content = None
+
+        formats = self._get_formats(song_id)
+
+        album_id = self._html_search_regex(
+            r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+            webpage, 'album id', fatal=False)
+
+        publish_time = None
+        if album_id is not None:
+            album_info_page = self._download_webpage(
+                'http://www.kuwo.cn/album/%s/' % album_id, song_id,
+                note='Download album detail info',
+                errnote='Unable to get album detail info')
+
+            publish_time = self._html_search_regex(
+                r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page,
+                'publish time', fatal=False)
+            if publish_time:
+                publish_time = publish_time.replace('-', '')
+
+        return {
+            'id': song_id,
+            'title': song_name,
+            'creator': singer_name,
+            'upload_date': publish_time,
+            'description': lrc_content,
+            'formats': formats,
+        }
+
+
+class KuwoAlbumIE(InfoExtractor):
+    IE_NAME = 'kuwo:album'
+    IE_DESC = '酷我音乐 - 专辑'
+    _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/album/502294/',
+        'info_dict': {
+            'id': '502294',
+            'title': 'M',
+            'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
+        },
+        'playlist_count': 2,
+    }
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, album_id, note='Download album info',
+            errnote='Unable to get album info')
+
+        album_name = self._html_search_regex(
+            r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
+            'album name')
+        album_intro = remove_start(
+            clean_html(get_element_by_id('intro', webpage)),
+            '%s简介:' % album_name)
+
+        entries = [
+            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
+                webpage)
+        ]
+        return self.playlist_result(entries, album_id, album_name, album_intro)
+
+
+class KuwoChartIE(InfoExtractor):
+    IE_NAME = 'kuwo:chart'
+    IE_DESC = '酷我音乐 - 排行榜'
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
+        'info_dict': {
+            'id': '香港中文龙虎榜',
+            'title': '香港中文龙虎榜',
+            'description': 're:\d{4}第\d{2}期',
+        },
+        'playlist_mincount': 10,
+    }
+
+    def _real_extract(self, url):
+        chart_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, chart_id, note='Download chart info',
+            errnote='Unable to get chart info')
+
+        chart_name = self._html_search_regex(
+            r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')
+
+        chart_desc = self._html_search_regex(
+            r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')
+
+        entries = [
+            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage)
+        ]
+        return self.playlist_result(entries, chart_id, chart_name, chart_desc)
+
+
+class KuwoSingerIE(InfoExtractor):
+    IE_NAME = 'kuwo:singer'
+    IE_DESC = '酷我音乐 - 歌手'
+    _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
+        'info_dict': {
+            'id': 'bruno+mars',
+            'title': 'Bruno Mars',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
+        'info_dict': {
+            'id': 'Ali',
+            'title': 'Ali',
+        },
+        'playlist_mincount': 95,
+    }]
+
+    def _real_extract(self, url):
+        singer_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, singer_id, note='Download singer info',
+            errnote='Unable to get singer info')
+
+        singer_name = self._html_search_regex(
+            r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name'
+        )
+
+        entries = []
+        first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True
+        for page_num in itertools.count(1):
+            webpage = self._download_webpage(
+                'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
+                singer_id, note='Download song list page #%d' % page_num,
+                errnote='Unable to get song list page #%d' % page_num)
+
+            entries.extend([
+                self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                    r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/',
+                    webpage)
+            ][:10 if first_page_only else None])
+
+            if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage):
+                break
+
+        return self.playlist_result(entries, singer_id, singer_name)
+
+
+class KuwoCategoryIE(InfoExtractor):
+    IE_NAME = 'kuwo:category'
+    IE_DESC = '酷我音乐 - 分类'
+    _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
+        'info_dict': {
+            'id': '86375',
+            'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
+        },
+        'playlist_count': 30,
+    }
+
+    def _real_extract(self, url):
+        category_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, category_id, note='Download category info',
+            errnote='Unable to get category info')
+
+        category_name = self._html_search_regex(
+            r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
+
+        category_desc = remove_start(
+            get_element_by_id('intro', webpage).strip(),
+            '%s简介:' % category_name)
+
+        jsonm = self._parse_json(self._html_search_regex(
+            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
+
+        entries = [
+            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
+            for song in jsonm['musiclist']
+        ]
+        return self.playlist_result(entries, category_id, category_name, category_desc)
+
+
+class KuwoMvIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:mv'
+    IE_DESC = '酷我音乐 - MV'
+    _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/mv/6480076/',
+        'info_dict': {
+            'id': '6480076',
+            'ext': 'mkv',
+            'title': '我们家MV',
+            'creator': '2PM',
+        },
+    }
+    _FORMATS = KuwoBaseIE._FORMATS + [
+        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
+        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
+    ]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download mv detail info: %s' % song_id,
+            errnote='Unable to get mv detail info: %s' % song_id)
+
+        mobj = re.search(
+            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
+            webpage)
+        if mobj:
+            song_name = mobj.group('song')
+            singer_name = mobj.group('singer')
+        else:
+            raise ExtractorError('Unable to find song or singer names')
+
+        formats = self._get_formats(song_id)
+
+        return {
+            'id': song_id,
+            'title': song_name,
+            'creator': singer_name,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py
new file mode 100644 (file)
index 0000000..40a3d23
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_duration,
+    int_or_none,
+)
+
+
+class Lecture2GoIE(InfoExtractor):
+    _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473',
+        'md5': 'ac02b570883020d208d405d5a3fd2f7f',
+        'info_dict': {
+            'id': '17473',
+            'ext': 'flv',
+            'title': '2 - Endliche Automaten und reguläre Sprachen',
+            'creator': 'Frank Heitmann',
+            'duration': 5220,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title')
+
+        formats = []
+        for url in set(re.findall(r'"src","([^"]+)"', webpage)):
+            ext = determine_ext(url)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(url, video_id))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(url, video_id))
+            else:
+                formats.append({
+                    'url': url,
+                })
+
+        self._sort_formats(formats)
+
+        creator = self._html_search_regex(
+            r'<div[^>]+id="description">([^<]+)</div>', webpage, 'creator', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'Duration:\s*</em>\s*<em[^>]*>([^<]+)</em>', webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'Views:\s*</em>\s*<em[^>]+>(\d+)</em>', webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'creator': creator,
+            'duration': duration,
+            'view_count': view_count,
+        }
index 1484ac0d267697dceb34c9e406e3a26b26a37f54..a28abb0f0081d05c5c2d0c07080ba33db0badaed 100644 (file)
@@ -15,10 +15,12 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     parse_iso8601,
+    int_or_none,
 )
 
 
 class LetvIE(InfoExtractor):
+    IE_DESC = '乐视网'
     _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
 
     _TESTS = [{
@@ -50,9 +52,7 @@ class LetvIE(InfoExtractor):
             'title': '与龙共舞 完整版',
             'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
         },
-        'params': {
-            'cn_verification_proxy': 'http://proxy.uku.im:8888'
-        },
+        'skip': 'Only available in China',
     }]
 
     @staticmethod
@@ -135,7 +135,7 @@ class LetvIE(InfoExtractor):
                 }
 
                 if format_id[-1:] == 'p':
-                    url_info_dict['height'] = format_id[:-1]
+                    url_info_dict['height'] = int_or_none(format_id[:-1])
 
                 urls.append(url_info_dict)
 
index 081016b8014afaf210eabe34404b7cd643fb65de..f8cbca7b36afab1890b71806d6761bbe67d7d924 100644 (file)
@@ -4,8 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
+    determine_ext,
     int_or_none,
+    remove_end,
     unified_strdate,
     ExtractorError,
 )
@@ -14,7 +17,7 @@ from ..utils import (
 class LifeNewsIE(InfoExtractor):
     IE_NAME = 'lifenews'
     IE_DESC = 'LIFE | NEWS'
-    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
+    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://lifenews.ru/news/126342',
@@ -37,7 +40,6 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
             'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
             'upload_date': '20150402',
-            'uploader': 'embed.life.ru',
         }
     }, {
         'url': 'http://lifenews.ru/news/153461',
@@ -48,36 +50,41 @@ class LifeNewsIE(InfoExtractor):
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
             'upload_date': '20150505',
-            'uploader': 'embed.life.ru',
         }
+    }, {
+        'url': 'http://lifenews.ru/video/13035',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        section = mobj.group('section')
 
-        webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
+        webpage = self._download_webpage(
+            'http://lifenews.ru/%s/%s' % (section, video_id),
+            video_id, 'Downloading page')
 
         videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
         iframe_link = self._html_search_regex(
-            '<iframe[^>]+src="([^"]+)', webpage, 'iframe link', default=None)
+            '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
         if not videos and not iframe_link:
             raise ExtractorError('No media links available for %s' % video_id)
 
-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Первый по срочным новостям — LIFE | NEWS')
 
         description = self._og_search_description(webpage)
 
         view_count = self._html_search_regex(
             r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>\s*<span class=\'counter\'>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)
+            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+            webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
-            r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
 
@@ -113,3 +120,49 @@ class LifeNewsIE(InfoExtractor):
             return make_entry(video_id, videos[0])
         else:
             return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+
+
+class LifeEmbedIE(InfoExtractor):
+    IE_NAME = 'life:embed'
+    _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+
+    _TEST = {
+        'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
+        'md5': 'b889715c9e49cb1981281d0e5458fbbe',
+        'info_dict': {
+            'id': 'e50c2dec2867350528e2574c899b8291',
+            'ext': 'mp4',
+            'title': 'e50c2dec2867350528e2574c899b8291',
+            'thumbnail': 're:http://.*\.jpg',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
+        for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+            video_url = compat_urlparse.urljoin(url, video_url)
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', m3u8_id='m3u8'))
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': ext,
+                    'preference': 1,
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._search_regex(
+            r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
index 35822067f908f0567e8dcb8c9c8265df4d3421c2..857edfde263196d9bf2811568cc9f9de90eed92b 100644 (file)
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
             'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
             'age_limit': 18,
         }
+    }, {
+        # Covers https://github.com/rg3/youtube-dl/pull/5983
+        'url': 'http://www.liveleak.com/view?i=801_1409392012',
+        'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+        'info_dict': {
+            'id': '801_1409392012',
+            'ext': 'mp4',
+            'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+            'uploader': 'bony333',
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+        }
     }]
 
     def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
             'url': s['file'],
         } for i, s in enumerate(sources)]
         for i, s in enumerate(sources):
-            orig_url = s['file'].replace('.h264_base.mp4', '')
+            # Removing '.h264_*.mp4' gives the raw video, which is essentially
+            # the same video without the LiveLeak logo at the top (see
+            # https://github.com/rg3/youtube-dl/pull/4768)
+            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
             if s['file'] != orig_url:
                 formats.append({
                     'format_id': 'original-%s' % i,
index cfd3b14f4bfd755a7600701e86900ece12b0c3ac..a00f6e5e5eb1d398ef0776d8b20dcb1dd51ec082 100644 (file)
@@ -30,13 +30,13 @@ class LyndaBaseIE(InfoExtractor):
             return
 
         login_form = {
-            'username': username,
-            'password': password,
+            'username': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
             'remember': 'false',
             'stayPut': 'false'
         }
         request = compat_urllib_request.Request(
-            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
         login_page = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
@@ -65,7 +65,7 @@ class LyndaBaseIE(InfoExtractor):
                     'stayPut': 'false',
                 }
                 request = compat_urllib_request.Request(
-                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
+                    self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))
                 login_page = self._download_webpage(
                     request, None,
                     'Confirming log in and log out from another device')
index 0b85a59d1c644d7d04e573aae0bdd03ebd4f6c80..92511a671ae300287fe6eb57b91b9c708dba45c5 100644 (file)
@@ -2,9 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class MalemotionIE(InfoExtractor):
@@ -24,7 +22,7 @@ class MalemotionIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_url = compat_urllib_parse.unquote(self._search_regex(
+        video_url = compat_urllib_parse_unquote(self._search_regex(
             r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
         video_title = self._html_search_regex(
             r'<title>(.*?)</title', webpage, 'title')
index 8bc333b0277e27e6fd8f3d4f11b3c9c7eabdd7d7..6e2e73a5162f10ea5818b636da579c932b4f2e7d 100644 (file)
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -155,7 +156,7 @@ class MetacafeIE(InfoExtractor):
         video_url = None
         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
         if mobj is not None:
-            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
+            mediaURL = compat_urllib_parse_unquote(mobj.group(1))
             video_ext = mediaURL[-3:]
 
             # Extract gdaKey if available
index d8897eb90d526b7b7d2e5a5ace5bec84ebb40031..852d722664a3d63aafed0f8246949335b4150c09 100644 (file)
@@ -5,6 +5,7 @@ import json
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -20,7 +21,6 @@ class MiTeleIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
-        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
         'info_dict': {
             'id': '0fce117d',
             'ext': 'mp4',
@@ -29,6 +29,10 @@ class MiTeleIE(InfoExtractor):
             'display_id': 'programa-144',
             'duration': 2913,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -45,7 +49,7 @@ class MiTeleIE(InfoExtractor):
             domain = 'http://' + domain
         info_url = compat_urlparse.urljoin(
             domain,
-            compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+            compat_urllib_parse_unquote(embed_data['flashvars']['host'])
         )
         info_el = self._download_xml(info_url, episode).find('./video/info')
 
@@ -56,12 +60,14 @@ class MiTeleIE(InfoExtractor):
             episode,
             transform_source=strip_jsonp
         )
+        formats = self._extract_m3u8_formats(
+            token_info['tokenizedUrl'], episode, ext='mp4')
 
         return {
             'id': embed_data['videoId'],
             'display_id': episode,
             'title': info_el.find('title').text,
-            'url': token_info['tokenizedUrl'],
+            'formats': formats,
             'description': get_element_by_attribute('class', 'text', webpage),
             'thumbnail': info_el.find('thumb').text,
             'duration': parse_duration(info_el.find('duration').text),
index 425a4ccf16fff96b1bface874748b93762d2194b..d47aecedae388829babaed8642611c5a6b7d29fe 100644 (file)
@@ -3,9 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     ExtractorError,
     HEADRequest,
@@ -60,7 +58,7 @@ class MixcloudIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group(1)
         cloudcast_name = mobj.group(2)
-        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
+        track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
 
         webpage = self._download_webpage(url, track_id)
 
index ee9ff73bf22f3fbcf769b63fdcbdf96395f1db4a..e242b897f2b63cf624805c7564cf7e2f02a9d16b 100644 (file)
@@ -10,7 +10,21 @@ from ..utils import (
 
 
 class MLBIE(InfoExtractor):
-    _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/(?:embed|m-internal-embed)\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[\da-z_-]+\.)*mlb\.com/
+                        (?:
+                            (?:
+                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+                                (?:
+                                    shared/video/embed/(?:embed|m-internal-embed)\.html|
+                                    (?:[^/]+/)+(?:play|index)\.jsp|
+                                )\?.*?\bcontent_id=
+                            )
+                            (?P<id>n?\d+)|
+                            (?:[^/]+/)*(?P<path>[^/]+)
+                        )
+                    '''
     _TESTS = [
         {
             'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -68,6 +82,18 @@ class MLBIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
+        {
+            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
+            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+            'info_dict': {
+                'id': '75609783',
+                'ext': 'mp4',
+                'title': 'Must C: Pillar climbs for catch',
+                'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
+                'timestamp': 1429124820,
+                'upload_date': '20150415',
+            }
+        },
         {
             'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
             'only_matching': True,
@@ -88,6 +114,10 @@ class MLBIE(InfoExtractor):
             # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer
             'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
             'only_matching': True,
+        },
+        {
+            'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#',
+            'only_matching': True,
         }
     ]
 
@@ -95,6 +125,12 @@ class MLBIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
+        if not video_id:
+            video_path = mobj.group('path')
+            webpage = self._download_webpage(url, video_path)
+            video_id = self._search_regex(
+                [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
+
         detail = self._download_xml(
             'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
             % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
index 2cec12d35ec1797dd7612ad49c5739e87f77e6c9..9bf99a54a98c4838c2b878db3ec165c867602110 100644 (file)
@@ -5,9 +5,9 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
-    compat_urllib_parse,
 )
 
 
@@ -34,7 +34,7 @@ class MofosexIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
-        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
+        video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
         path = compat_urllib_parse_urlparse(video_url).path
         extension = os.path.splitext(path)[1][1:]
         format = path.split('/')[5].split('_')[:2]
index 5b9b9fbcd0844897d6d63305ed00729e70c7f4fb..4557a2b13b3e47a75242ebd4a5c095bf17cbaacf 100644 (file)
@@ -35,7 +35,8 @@ class MySpassIE(InfoExtractor):
 
         # get metadata
         metadata_url = META_DATA_URL_TEMPLATE % video_id
-        metadata = self._download_xml(metadata_url, video_id)
+        metadata = self._download_xml(
+            metadata_url, video_id, transform_source=lambda s: s.strip())
 
         # extract values from metadata
         url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py
new file mode 100644 (file)
index 0000000..4c65be1
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .vimple import SprutoBaseIE
+
+
+class MyviIE(SprutoBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        myvi\.(?:ru/player|tv)/
+                            (?:
+                                (?:
+                                    embed/html|
+                                    flash|
+                                    api/Video/Get
+                                )/|
+                                content/preloader\.swf\?.*\bid=
+                            )
+                            (?P<id>[\da-zA-Z_-]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
+        'info_dict': {
+            'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
+            'ext': 'mp4',
+            'title': 'хозяин жизни',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 25,
+        },
+    }, {
+        'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        spruto = self._download_json(
+            'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData']
+
+        return self._extract_spruto(spruto, video_id)
index 5e754fcffb6403cbd359b9b358ad3879ef279f53..c96f472a39e569c7dfb88682d36fad9ed6ce2c10 100644 (file)
@@ -10,6 +10,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_ord,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_request,
 )
 from ..utils import (
@@ -107,7 +108,7 @@ class MyVideoIE(InfoExtractor):
             if not a == '_encxml':
                 params[a] = b
             else:
-                encxml = compat_urllib_parse.unquote(b)
+                encxml = compat_urllib_parse_unquote(b)
         if not params.get('domain'):
             params['domain'] = 'www.myvideo.de'
         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
@@ -135,7 +136,7 @@ class MyVideoIE(InfoExtractor):
         video_url = None
         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
         if mobj:
-            video_url = compat_urllib_parse.unquote(mobj.group(1))
+            video_url = compat_urllib_parse_unquote(mobj.group(1))
             if 'myvideo2flash' in video_url:
                 self.report_warning(
                     'Rewriting URL to use unencrypted rtmp:// ...',
@@ -147,10 +148,10 @@ class MyVideoIE(InfoExtractor):
             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
             if mobj is None:
                 raise ExtractorError('unable to extract url')
-            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+            video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2))
 
         video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
-        video_file = compat_urllib_parse.unquote(video_file)
+        video_file = compat_urllib_parse_unquote(video_file)
 
         if not video_file.endswith('f4m'):
             ppath, prefix = video_file.split('.')
@@ -159,7 +160,7 @@ class MyVideoIE(InfoExtractor):
             video_playpath = ''
 
         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
-        video_swfobj = compat_urllib_parse.unquote(video_swfobj)
+        video_swfobj = compat_urllib_parse_unquote(video_swfobj)
 
         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
                                               webpage, 'title')
index c18640c5a9f0093344475ad649643705522cddb9..6fc9e7b050c75ced3262ebad9ba92d74ed6b5d7d 100644 (file)
@@ -8,25 +8,40 @@ from ..utils import (
 
 
 class NationalGeographicIE(InfoExtractor):
-    _VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?'
-
-    _TEST = {
-        'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
-        'info_dict': {
-            'id': '4DmDACA6Qtk_',
-            'ext': 'flv',
-            'title': 'Mating Crabs Busted by Sharks',
-            'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+    _VALID_URL = r'http://video\.nationalgeographic\.com/.*?'
+
+    _TESTS = [
+        {
+            'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+            'info_dict': {
+                'id': '4DmDACA6Qtk_',
+                'ext': 'flv',
+                'title': 'Mating Crabs Busted by Sharks',
+                'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+            },
+            'add_ie': ['ThePlatform'],
         },
-        'add_ie': ['ThePlatform'],
-    }
+        {
+            'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws',
+            'info_dict': {
+                'id': '_JeBD_D7PlS5',
+                'ext': 'flv',
+                'title': 'The Real Jaws',
+                'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6',
+            },
+            'add_ie': ['ThePlatform'],
+        },
+    ]
 
     def _real_extract(self, url):
         name = url_basename(url)
 
         webpage = self._download_webpage(url, name)
-        feed_url = self._search_regex(r'data-feed-url="([^"]+)"', webpage, 'feed url')
-        guid = self._search_regex(r'data-video-guid="([^"]+)"', webpage, 'guid')
+        feed_url = self._search_regex(
+            r'data-feed-url="([^"]+)"', webpage, 'feed url')
+        guid = self._search_regex(
+            r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
+            webpage, 'guid')
 
         feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
         content = feed.find('.//{http://search.yahoo.com/mrss/}content')
@@ -34,5 +49,6 @@ class NationalGeographicIE(InfoExtractor):
 
         return self.url_result(smuggle_url(
             'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
-            # For some reason, the normal links don't work and we must force the use of f4m
+            # For some reason, the normal links don't work and we must force
+            # the use of f4m
             {'force_smil_url': True}))
index c10405f04d3cc1b3e89004029b7502112e9baa29..925967753bd12816005b5ed8929f0438e9ec0214 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
 class NaverIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://tvcast.naver.com/v/81652',
         'info_dict': {
             'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
             'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
             'upload_date': '20130903',
         },
-    }
+    }, {
+        'url': 'http://tvcast.naver.com/v/395837',
+        'md5': '638ed4c12012c458fefcddfd01f173cd',
+        'info_dict': {
+            'id': '395837',
+            'ext': 'mp4',
+            'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+            'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+            'upload_date': '20150519',
+        },
+        'skip': 'Georestricted',
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
                          webpage)
         if m_id is None:
             m_error = re.search(
-                r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+                r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
                 webpage)
             if m_error:
                 raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
         formats = []
         for format_el in urls.findall('EncodingOptions/EncodingOption'):
             domain = format_el.find('Domain').text
+            uri = format_el.find('uri').text
             f = {
-                'url': domain + format_el.find('uri').text,
+                'url': compat_urlparse.urljoin(domain, uri),
                 'ext': 'mp4',
                 'width': int(format_el.find('width').text),
                 'height': int(format_el.find('height').text),
             }
             if domain.startswith('rtmp'):
+                # urlparse does not support custom schemes
+                # https://bugs.python.org/issue18828
                 f.update({
+                    'url': domain + uri,
                     'ext': 'flv',
                     'rtmp_protocol': '1',  # rtmpt
                 })
index 862b706bf96719aa071f1f89c73f2a4ef45a20b1..944096e1ca15de964fcdf896adf988c9aa2264bd 100644 (file)
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
+    }, {
+        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+        'info_dict': {
+            'id': '0041400301-cle-atl-recap.nba',
+            'ext': 'mp4',
+            'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+            'duration': 228,
+        },
+        'params': {
+            'skip_download': True,
+        }
     }]
 
     def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
             self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
 
         description = self._og_search_description(webpage)
-        duration = parse_duration(
-            self._html_search_meta('duration', webpage, 'duration'))
+        duration_str = self._html_search_meta(
+            'duration', webpage, 'duration', default=None)
+        if not duration_str:
+            duration_str = self._html_search_regex(
+                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+        duration = parse_duration(duration_str)
 
         return {
             'id': shortened_video_id,
index f49c666909a270ad18e36a2d1177ef681adc3121..79a13958b05e25a1c9e586168bb3a10742fbe01f 100644 (file)
@@ -8,41 +8,11 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
+    parse_duration,
 )
 
 
-class NDRIE(InfoExtractor):
-    IE_NAME = 'ndr'
-    IE_DESC = 'NDR.de - Mediathek'
-    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
-
-    _TESTS = [
-        {
-            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
-            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
-            'note': 'Video file',
-            'info_dict': {
-                'id': '25866',
-                'ext': 'mp4',
-                'title': 'Kartoffeltage in der Lewitz',
-                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
-                'duration': 166,
-            }
-        },
-        {
-            'url': 'http://www.ndr.de/info/audio51535.html',
-            'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
-            'note': 'Audio file',
-            'info_dict': {
-                'id': '51535',
-                'ext': 'mp3',
-                'title': 'La Valette entgeht der Hinrichtung',
-                'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
-                'duration': 884,
-            }
-        }
-    ]
-
+class NDRBaseIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
@@ -54,7 +24,11 @@ class NDRIE(InfoExtractor):
         if description:
             description = description.strip()
 
-        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
+        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None))
+        if not duration:
+            duration = parse_duration(self._html_search_regex(
+                r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)',
+                page, 'duration', default=None))
 
         formats = []
 
@@ -92,3 +66,65 @@ class NDRIE(InfoExtractor):
             'duration': duration,
             'formats': formats,
         }
+
+
+class NDRIE(NDRBaseIE):
+    IE_NAME = 'ndr'
+    IE_DESC = 'NDR.de - Mediathek'
+    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
+
+    _TESTS = [
+        {
+            'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
+            'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
+            'note': 'Video file',
+            'info_dict': {
+                'id': '25866',
+                'ext': 'mp4',
+                'title': 'Kartoffeltage in der Lewitz',
+                'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
+                'duration': 166,
+            },
+            'skip': '404 Not found',
+        },
+        {
+            'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
+            'md5': 'dadc003c55ae12a5d2f6bd436cd73f59',
+            'info_dict': {
+                'id': '988',
+                'ext': 'mp4',
+                'title': 'Party, Pötte und Parade',
+                'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.',
+                'duration': 3498,
+            },
+        },
+        {
+            'url': 'http://www.ndr.de/info/audio51535.html',
+            'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+            'note': 'Audio file',
+            'info_dict': {
+                'id': '51535',
+                'ext': 'mp3',
+                'title': 'La Valette entgeht der Hinrichtung',
+                'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
+                'duration': 884,
+            }
+        }
+    ]
+
+
+class NJoyIE(NDRBaseIE):
+    IE_NAME = 'N-JOY'
+    _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html'
+
+    _TEST = {
+        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
+        'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+        'info_dict': {
+            'id': '2480',
+            'ext': 'mp4',
+            'title': 'Benaissa beim NDR Comedy Contest',
+            'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.',
+            'duration': 654,
+        }
+    }
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
new file mode 100644 (file)
index 0000000..a8e0a64
--- /dev/null
@@ -0,0 +1,459 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from hashlib import md5
+from base64 import b64encode
+from datetime import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urllib_parse,
+    compat_str,
+    compat_itertools_count,
+)
+
+
+class NetEaseMusicBaseIE(InfoExtractor):
+    _FORMATS = ['bMusic', 'mMusic', 'hMusic']
+    _NETEASE_SALT = '3go8&$8*3*3h0k(2)2'
+    _API_BASE = 'http://music.163.com/api/'
+
+    @classmethod
+    def _encrypt(cls, dfsid):
+        salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
+        string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
+        salt_len = len(salt_bytes)
+        for i in range(len(string_bytes)):
+            string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len]
+        m = md5()
+        m.update(bytes(string_bytes))
+        result = b64encode(m.digest()).decode('ascii')
+        return result.replace('/', '_').replace('+', '-')
+
+    @classmethod
+    def extract_formats(cls, info):
+        formats = []
+        for song_format in cls._FORMATS:
+            details = info.get(song_format)
+            if not details:
+                continue
+            formats.append({
+                'url': 'http://m1.music.126.net/%s/%s.%s' %
+                       (cls._encrypt(details['dfsId']), details['dfsId'],
+                        details['extension']),
+                'ext': details.get('extension'),
+                'abr': details.get('bitrate', 0) / 1000,
+                'format_id': song_format,
+                'filesize': details.get('size'),
+                'asr': details.get('sr')
+            })
+        return formats
+
+    @classmethod
+    def convert_milliseconds(cls, ms):
+        return int(round(ms / 1000.0))
+
+    def query_api(self, endpoint, video_id, note):
+        req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint))
+        req.add_header('Referer', self._API_BASE)
+        return self._download_json(req, video_id, note)
+
+
+class NetEaseMusicIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:song'
+    IE_DESC = '网易云音乐'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/song?id=32102397',
+        'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
+        'info_dict': {
+            'id': '32102397',
+            'ext': 'mp3',
+            'title': 'Bad Blood (feat. Kendrick Lamar)',
+            'creator': 'Taylor Swift / Kendrick Lamar',
+            'upload_date': '20150517',
+            'timestamp': 1431878400,
+            'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
+        },
+    }, {
+        'note': 'No lyrics translation.',
+        'url': 'http://music.163.com/#/song?id=29822014',
+        'info_dict': {
+            'id': '29822014',
+            'ext': 'mp3',
+            'title': '听见下雨的声音',
+            'creator': '周杰伦',
+            'upload_date': '20141225',
+            'timestamp': 1419523200,
+            'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
+        },
+    }, {
+        'note': 'No lyrics.',
+        'url': 'http://music.163.com/song?id=17241424',
+        'info_dict': {
+            'id': '17241424',
+            'ext': 'mp3',
+            'title': 'Opus 28',
+            'creator': 'Dustin O\'Halloran',
+            'upload_date': '20080211',
+            'timestamp': 1202745600,
+        },
+    }, {
+        'note': 'Has translated name.',
+        'url': 'http://music.163.com/#/song?id=22735043',
+        'info_dict': {
+            'id': '22735043',
+            'ext': 'mp3',
+            'title': '소원을 말해봐 (Genie)',
+            'creator': '少女时代',
+            'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
+            'upload_date': '20100127',
+            'timestamp': 1264608000,
+            'alt_title': '说出愿望吧(Genie)',
+        }
+    }]
+
+    def _process_lyrics(self, lyrics_info):
+        original = lyrics_info.get('lrc', {}).get('lyric')
+        translated = lyrics_info.get('tlyric', {}).get('lyric')
+
+        if not translated:
+            return original
+
+        lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
+        original_ts_texts = re.findall(lyrics_expr, original)
+        translation_ts_dict = dict(
+            (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated)
+        )
+        lyrics = '\n'.join([
+            '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, ''))
+            for time_stamp, text in original_ts_texts
+        ])
+        return lyrics
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
+        params = {
+            'id': song_id,
+            'ids': '[%s]' % song_id
+        }
+        info = self.query_api(
+            'song/detail?' + compat_urllib_parse.urlencode(params),
+            song_id, 'Downloading song info')['songs'][0]
+
+        formats = self.extract_formats(info)
+        self._sort_formats(formats)
+
+        lyrics_info = self.query_api(
+            'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
+            song_id, 'Downloading lyrics data')
+        lyrics = self._process_lyrics(lyrics_info)
+
+        alt_title = None
+        if info.get('transNames'):
+            alt_title = '/'.join(info.get('transNames'))
+
+        return {
+            'id': song_id,
+            'title': info['name'],
+            'alt_title': alt_title,
+            'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
+            'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
+            'thumbnail': info.get('album', {}).get('picUrl'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+            'description': lyrics,
+            'formats': formats,
+        }
+
+
+class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:album'
+    IE_DESC = '网易云音乐 - 专辑'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/album?id=220780',
+        'info_dict': {
+            'id': '220780',
+            'title': 'B\'day',
+        },
+        'playlist_count': 23,
+    }
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+
+        info = self.query_api(
+            'album/%s?id=%s' % (album_id, album_id),
+            album_id, 'Downloading album data')['album']
+
+        name = info['name']
+        desc = info.get('description')
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['songs']
+        ]
+        return self.playlist_result(entries, album_id, name, desc)
+
+
+class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:singer'
+    IE_DESC = '网易云音乐 - 歌手'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'note': 'Singer has aliases.',
+        'url': 'http://music.163.com/#/artist?id=10559',
+        'info_dict': {
+            'id': '10559',
+            'title': '张惠妹 - aMEI;阿密特',
+        },
+        'playlist_count': 50,
+    }, {
+        'note': 'Singer has translated name.',
+        'url': 'http://music.163.com/#/artist?id=124098',
+        'info_dict': {
+            'id': '124098',
+            'title': '李昇基 - 이승기',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        singer_id = self._match_id(url)
+
+        info = self.query_api(
+            'artist/%s?id=%s' % (singer_id, singer_id),
+            singer_id, 'Downloading singer data')
+
+        name = info['artist']['name']
+        if info['artist']['trans']:
+            name = '%s - %s' % (name, info['artist']['trans'])
+        if info['artist']['alias']:
+            name = '%s - %s' % (name, ';'.join(info['artist']['alias']))
+
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['hotSongs']
+        ]
+        return self.playlist_result(entries, singer_id, name)
+
+
+class NetEaseMusicListIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:playlist'
+    IE_DESC = '网易云音乐 - 歌单'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/playlist?id=79177352',
+        'info_dict': {
+            'id': '79177352',
+            'title': 'Billboard 2007 Top 100',
+            'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
+        },
+        'playlist_count': 99,
+    }, {
+        'note': 'Toplist/Charts sample',
+        'url': 'http://music.163.com/#/discover/toplist?id=3733003',
+        'info_dict': {
+            'id': '3733003',
+            'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}',
+            'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        info = self.query_api(
+            'playlist/detail?id=%s&lv=-1&tv=-1' % list_id,
+            list_id, 'Downloading playlist data')['result']
+
+        name = info['name']
+        desc = info.get('description')
+
+        if info.get('specialType') == 10:  # is a chart/toplist
+            datestamp = datetime.fromtimestamp(
+                self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d')
+            name = '%s %s' % (name, datestamp)
+
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['tracks']
+        ]
+        return self.playlist_result(entries, list_id, name, desc)
+
+
+class NetEaseMusicMvIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:mv'
+    IE_DESC = '网易云音乐 - MV'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/mv?id=415350',
+        'info_dict': {
+            'id': '415350',
+            'ext': 'mp4',
+            'title': '이럴거면 그러지말지',
+            'description': '白雅言自作曲唱甜蜜爱情',
+            'creator': '白雅言',
+            'upload_date': '20150520',
+        },
+    }
+
+    def _real_extract(self, url):
+        mv_id = self._match_id(url)
+
+        info = self.query_api(
+            'mv/detail?id=%s&type=mp4' % mv_id,
+            mv_id, 'Downloading mv info')['data']
+
+        formats = [
+            {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)}
+            for brs, mv_url in info['brs'].items()
+        ]
+        self._sort_formats(formats)
+
+        return {
+            'id': mv_id,
+            'title': info['name'],
+            'description': info.get('desc') or info.get('briefDesc'),
+            'creator': info['artistName'],
+            'upload_date': info['publishTime'].replace('-', ''),
+            'formats': formats,
+            'thumbnail': info.get('cover'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+        }
+
+
+class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:program'
+    IE_DESC = '网易云音乐 - 电台节目'
+    _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/program?id=10109055',
+        'info_dict': {
+            'id': '10109055',
+            'ext': 'mp3',
+            'title': '不丹足球背后的故事',
+            'description': '喜马拉雅人的足球梦 ...',
+            'creator': '大话西藏',
+            'timestamp': 1434179342,
+            'upload_date': '20150613',
+            'duration': 900,
+        },
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+        },
+        'playlist_count': 4,
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'ext': 'mp3',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+            'timestamp': 1434450841,
+            'upload_date': '20150616',
+        },
+        'params': {
+            'noplaylist': True
+        }
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url)
+
+        info = self.query_api(
+            'dj/program/detail?id=%s' % program_id,
+            program_id, 'Downloading program info')['program']
+
+        name = info['name']
+        description = info['description']
+
+        if not info['songs'] or self._downloader.params.get('noplaylist'):
+            if info['songs']:
+                self.to_screen(
+                    'Downloading just the main audio %s because of --no-playlist'
+                    % info['mainSong']['id'])
+
+            formats = self.extract_formats(info['mainSong'])
+            self._sort_formats(formats)
+
+            return {
+                'id': program_id,
+                'title': name,
+                'description': description,
+                'creator': info['dj']['brand'],
+                'timestamp': self.convert_milliseconds(info['createTime']),
+                'thumbnail': info['coverUrl'],
+                'duration': self.convert_milliseconds(info.get('duration', 0)),
+                'formats': formats,
+            }
+
+        self.to_screen(
+            'Downloading playlist %s - add --no-playlist to just download the main audio %s'
+            % (program_id, info['mainSong']['id']))
+
+        song_ids = [info['mainSong']['id']]
+        song_ids.extend([song['id'] for song in info['songs']])
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song_id,
+                            'NetEaseMusic', song_id)
+            for song_id in song_ids
+        ]
+        return self.playlist_result(entries, program_id, name, description)
+
+
+class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:djradio'
+    IE_DESC = '网易云音乐 - 电台'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/djradio?id=42',
+        'info_dict': {
+            'id': '42',
+            'title': '声音蔓延',
+            'description': 'md5:766220985cbd16fdd552f64c578a6b15'
+        },
+        'playlist_mincount': 40,
+    }
+    _PAGE_SIZE = 1000
+
+    def _real_extract(self, url):
+        dj_id = self._match_id(url)
+
+        name = None
+        desc = None
+        entries = []
+        for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE):
+            info = self.query_api(
+                'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
+                % (self._PAGE_SIZE, dj_id, offset),
+                dj_id, 'Downloading dj programs - %d' % offset)
+
+            entries.extend([
+                self.url_result(
+                    'http://music.163.com/#/program?id=%s' % program['id'],
+                    'NetEaseMusicProgram', program['id'])
+                for program in info['programs']
+            ])
+
+            if name is None:
+                radio = info['programs'][0]['radio']
+                name = radio['name']
+                desc = radio['desc']
+
+            if not info['more']:
+                break
+
+        return self.playlist_result(entries, dj_id, name, desc)
index bc17e20aa9d736eb9e4ba0a39929f20db47d8465..0d165a82ad53ac8ac16ca8943c934db9fb28b720 100644 (file)
@@ -49,7 +49,7 @@ class NetzkinoIE(InfoExtractor):
             'http://www.netzkino.de/beta/dist/production.min.js', video_id,
             note='Downloading player code')
         avo_js = self._search_regex(
-            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+            r'var urlTemplate=(\{.*?"\})',
             production_js, 'URL templates')
         templates = self._parse_json(
             avo_js, video_id, transform_source=js_to_json)
index 85fcad06b51dc9ce87bdd563043c92a126cc8eea..5a9e73cd66a1b1224bdec848722f5e9d14f65c38 100644 (file)
@@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor):
         page = self._download_webpage(url, video_id, 'Downloading page')
 
         video_guid = self._html_search_regex(
-            r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
             page, 'video GUID')
 
         player = self._download_xml(
index 02dba4ef639e64deff790f94bd5cd0cd6da2a4cb..c10784f6b7321395e69c86ab06f91f8d3b37b655 100644 (file)
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601
 
 
 class NextMediaIE(InfoExtractor):
+    IE_DESC = '蘋果日報'
     _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
@@ -66,6 +67,7 @@ class NextMediaIE(InfoExtractor):
 
 
 class NextMediaActionNewsIE(NextMediaIE):
+    IE_DESC = '蘋果日報 - 動新聞'
     _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
     _TESTS = [{
         'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
@@ -89,8 +91,9 @@ class NextMediaActionNewsIE(NextMediaIE):
         return self._extract_from_nextmedia_page(news_id, url, article_page)
 
 
-class AppleDailyRealtimeNewsIE(NextMediaIE):
-    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+    IE_DESC = '臺灣蘋果日報'
+    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
     _TESTS = [{
         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +102,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:b23787119933404ce515c6356a8c355c',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
             'upload_date': '20150128',
         }
     }, {
@@ -110,26 +113,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '不滿被踩腳 山東兩大媽一路打下車',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
             'upload_date': '20150128',
         }
-    }]
-
-    _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
-    def _fetch_title(self, page):
-        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
-    def _fetch_thumbnail(self, page):
-        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
-    def _fetch_timestamp(self, page):
-        return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
-    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
-    _TESTS = [{
+    }, {
         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
         'md5': '03df296d95dedc2d5886debbb80cb43f',
         'info_dict': {
@@ -154,10 +141,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
         'expected_warnings': [
             'video thumbnail',
         ]
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'only_matching': True,
     }]
 
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
     def _fetch_title(self, page):
-        return self._html_search_meta('description', page, 'news title')
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+                self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
 
     def _fetch_description(self, page):
         return self._html_search_meta('description', page, 'news description')
index 2684dd250aa65e22903612f4a1780fc8f701296a..dc54634a58e440fc70ae9bcb3e7d5781981b2b1e 100644 (file)
@@ -19,7 +19,7 @@ class NFLIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
         (?:.+?/)*
-        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+        (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
     _TESTS = [
         {
             'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
@@ -58,6 +58,10 @@ class NFLIE(InfoExtractor):
                 'upload_date': '20150202',
             },
         },
+        {
+            'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+            'only_matching': True,
+        }
     ]
 
     @staticmethod
index 3cecebf95a4acd0a388da5984aebc2822e79b32c..0f8aa5adad5b2247621ce00249f3bd03a33a104a 100644 (file)
@@ -182,7 +182,6 @@ class NiconicoIE(InfoExtractor):
         extension = xpath_text(video_info, './/movie_type')
         if not extension:
             extension = determine_ext(video_real_url)
-        video_format = extension.upper()
 
         thumbnail = (
             xpath_text(video_info, './/thumbnail_url') or
@@ -241,7 +240,7 @@ class NiconicoIE(InfoExtractor):
             'url': video_real_url,
             'title': title,
             'ext': extension,
-            'format': video_format,
+            'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
             'thumbnail': thumbnail,
             'description': description,
             'uploader': uploader,
index 664dc81d47ce7af613636022f4e540dffd67f8b6..a53e27b274eaa21ac15a1dc5077001d520832696 100644 (file)
@@ -166,6 +166,10 @@ class NocoIE(InfoExtractor):
         self._sort_formats(formats)
 
         timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+        if timestamp is not None and timestamp < 0:
+            timestamp = None
+
         uploader = show.get('partner_name')
         uploader_id = show.get('partner_key')
         duration = float_or_none(show.get('duration_ms'), 1000)
@@ -191,7 +195,7 @@ class NocoIE(InfoExtractor):
         if episode_number:
             title += ' #' + compat_str(episode_number)
         if episode:
-            title += ' - ' + episode
+            title += ' - ' + compat_str(episode)
 
         description = show.get('show_resume') or show.get('family_resume')
 
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644 (file)
index 0000000..3f9c776
--- /dev/null
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+    _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+    _TESTS = [{
+        'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+        'info_dict': {
+            'id': '1608920',
+            'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+            'ext': 'flv',
+            'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+            'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'info_dict': {
+            'id': '1757139',
+            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+            'ext': 'mp4',
+            'title': 'Podzemní nemocnice v pražské Krči',
+            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+        'info_dict': {
+            'id': '1756825',
+            'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+            'ext': 'flv',
+            'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+            'description': 'md5:dc24e50be5908df83348e50d1431295e',  # Make sure this description is clean of html tags
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+        'info_dict': {
+            'id': '1756858',
+            'ext': 'flv',
+            'title': 'Televizní noviny - 30. 5. 2015',
+            'thumbnail': 're:^https?://.*\.(?:jpg)',
+            'upload_date': '20150530',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'info_dict': {
+            'id': '1753621',
+            'ext': 'mp4',
+            'title': 'Zaklínač 3: Divoký hon',
+            'description': 're:.*Pokud se stejně jako my nemůžete.*',
+            'thumbnail': 're:https?://.*\.jpg(\?.*)?',
+            'upload_date': '20150521',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        site = mobj.group('site')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            [r"(?:media|video_id)\s*:\s*'(\d+)'",
+             r'media=(\d+)',
+             r'id="article_video_(\d+)"',
+             r'id="player_(\d+)"'],
+            webpage, 'video id')
+
+        config_url = self._search_regex(
+            r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+            webpage, 'config url', default=None)
+
+        if not config_url:
+            DEFAULT_SITE_ID = '23000'
+            SITES = {
+                'tvnoviny': DEFAULT_SITE_ID,
+                'novaplus': DEFAULT_SITE_ID,
+                'vymena': DEFAULT_SITE_ID,
+                'krasna': DEFAULT_SITE_ID,
+                'fanda': '30',
+                'tn': '30',
+                'doma': '30',
+            }
+
+            site_id = self._search_regex(
+                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+            config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+                          % (site_id, video_id))
+
+        config = self._download_json(
+            config_url, display_id,
+            'Downloading config JSON',
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        mediafile = config['mediafile']
+        video_url = mediafile['src']
+
+        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+        if m:
+            formats = [{
+                'url': m.group('url'),
+                'app': m.group('app'),
+                'play_path': m.group('playpath'),
+                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+                'ext': 'flv',
+            }]
+        else:
+            formats = [{
+                'url': video_url,
+            }]
+        self._sort_formats(formats)
+
+        title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+        description = clean_html(self._og_search_description(webpage, default=None))
+        thumbnail = config.get('poster')
+
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644 (file)
index 0000000..0b5ff47
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        f = info.get('format', {})
+        station = f.get('station') or station
+
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index 5d84485714b9f360d47c8676710e9c3e6d9578c7..0c2d02c108ed425cf4688c34aa2a826ddaa400b4 100644 (file)
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     fix_xml_ampersands,
@@ -7,7 +9,6 @@ from ..utils import (
     qualities,
     strip_jsonp,
     unified_strdate,
-    url_basename,
 )
 
 
@@ -16,13 +17,42 @@ class NPOBaseIE(InfoExtractor):
         token_page = self._download_webpage(
             'http://ida.omroep.nl/npoplayer/i.js',
             video_id, note='Downloading token')
-        return self._search_regex(
+        token = self._search_regex(
             r'npoplayer\.token = "(.+?)"', token_page, 'token')
+        # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
+        token_l = list(token)
+        first = second = None
+        for i in range(5, len(token_l) - 4):
+            if token_l[i].isdigit():
+                if first is None:
+                    first = i
+                elif second is None:
+                    second = i
+        if first is None or second is None:
+            first = 12
+            second = 13
+
+        token_l[first], token_l[second] = token_l[second], token_l[first]
+
+        return ''.join(token_l)
 
 
 class NPOIE(NPOBaseIE):
-    IE_NAME = 'npo.nl'
-    _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)'
+    IE_NAME = 'npo'
+    IE_DESC = 'npo.nl and ntr.nl'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        npo:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+                                ntr\.nl/(?:[^/]+/){2,}|
+                                omroepwnl\.nl/video/fragment/[^/]+__
+                            )
+                        )
+                        (?P<id>[^/?#]+)
+                '''
 
     _TESTS = [
         {
@@ -42,7 +72,7 @@ class NPOIE(NPOBaseIE):
             'info_dict': {
                 'id': 'VARA_101191800',
                 'ext': 'm4v',
-                'title': 'De Mega Mike & Mega Thomas show',
+                'title': 'De Mega Mike & Mega Thomas show: The best of.',
                 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
                 'upload_date': '20090227',
                 'duration': 2400,
@@ -54,8 +84,8 @@ class NPOIE(NPOBaseIE):
             'info_dict': {
                 'id': 'VPWON_1169289',
                 'ext': 'm4v',
-                'title': 'Tegenlicht',
-                'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+                'title': 'Tegenlicht: De toekomst komt uit Afrika',
+                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
                 'upload_date': '20130225',
                 'duration': 3000,
             },
@@ -84,6 +114,30 @@ class NPOIE(NPOBaseIE):
                 'title': 'Hoe gaat Europa verder na Parijs?',
             },
         },
+        {
+            'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+            'md5': '01c6a2841675995da1f0cf776f03a9c3',
+            'info_dict': {
+                'id': 'VPWON_1233944',
+                'ext': 'm4v',
+                'title': 'Aap, poot, pies',
+                'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+                'upload_date': '20150508',
+                'duration': 599,
+            },
+        },
+        {
+            'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+            'md5': 'd30cd8417b8b9bca1fdff27428860d08',
+            'info_dict': {
+                'id': 'POW_00996502',
+                'ext': 'm4v',
+                'title': '''"Dit is wel een 'landslide'..."''',
+                'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+                'upload_date': '20150508',
+                'duration': 462,
+            },
+        }
     ]
 
     def _real_extract(self, url):
@@ -92,12 +146,24 @@ class NPOIE(NPOBaseIE):
 
     def _get_info(self, video_id):
         metadata = self._download_json(
-            'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+            'http://e.omroep.nl/metadata/%s' % video_id,
             video_id,
             # We have to remove the javascript callback
             transform_source=strip_jsonp,
         )
 
+        # For some videos actual video id (prid) is different (e.g. for
+        # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
+        # video id is POMS_WNL_853698 but prid is POW_00996502)
+        video_id = metadata.get('prid') or video_id
+
+        # titel is too generic in some cases so utilize aflevering_titel as well
+        # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
+        title = metadata['titel']
+        sub_title = metadata.get('aflevering_titel')
+        if sub_title and sub_title != title:
+            title += ': %s' % sub_title
+
         token = self._get_token(video_id)
 
         formats = []
@@ -170,8 +236,8 @@ class NPOIE(NPOBaseIE):
 
         return {
             'id': video_id,
-            'title': metadata['titel'],
-            'description': metadata['info'],
+            'title': title,
+            'description': metadata.get('info'),
             'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
             'upload_date': unified_strdate(metadata.get('gidsdatum')),
             'duration': parse_duration(metadata.get('tijdsduur')),
@@ -340,9 +406,8 @@ class NPORadioFragmentIE(InfoExtractor):
         }
 
 
-class TegenlichtVproIE(NPOIE):
-    IE_NAME = 'tegenlicht.vpro.nl'
-    _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
+class VPROIE(NPOIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
 
     _TESTS = [
         {
@@ -351,17 +416,72 @@ class TegenlichtVproIE(NPOIE):
             'info_dict': {
                 'id': 'VPWON_1169289',
                 'ext': 'm4v',
-                'title': 'Tegenlicht',
-                'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+                'title': 'De toekomst komt uit Afrika',
+                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
                 'upload_date': '20130225',
             },
         },
+        {
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
+            'info_dict': {
+                'id': 'sergio-herman',
+                'title': 'Sergio Herman: Fucking perfect',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # playlist with youtube embed
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
+            'info_dict': {
+                'id': 'education-education',
+                'title': '2Doc',
+            },
+            'playlist_count': 2,
+        }
     ]
 
     def _real_extract(self, url):
-        name = url_basename(url)
-        webpage = self._download_webpage(url, name)
-        urn = self._html_search_meta('mediaurn', webpage)
-        info_page = self._download_json(
-            'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name)
-        return self._get_info(info_page['mid'])
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+            for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
+        ]
+
+        playlist_title = self._search_regex(
+            r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
+            webpage, 'playlist title', default=None) or self._og_search_title(webpage)
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class WNLIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+
+    _TEST = {
+        'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
+        'info_dict': {
+            'id': 'vandaag-de-dag-6-mei',
+            'title': 'Vandaag de Dag 6 mei',
+        },
+        'playlist_count': 4,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('npo:%s' % video_id, 'NPO')
+            for video_id, part in re.findall(
+                r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage)
+        ]
+
+        playlist_title = self._html_search_regex(
+            r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
+            webpage, 'playlist title')
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
index e91d3a248ec3367af8b7e9a8f60be0d503877481..d066a96db137ee3fb2c36f712803622e73b4aa40 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -14,7 +13,7 @@ from ..utils import (
 
 
 class NRKIE(InfoExtractor):
-    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
 
     _TESTS = [
         {
@@ -77,7 +76,7 @@ class NRKIE(InfoExtractor):
 
 
 class NRKPlaylistIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -117,11 +116,12 @@ class NRKPlaylistIE(InfoExtractor):
 
 
 class NRKTVIE(InfoExtractor):
-    _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+    IE_DESC = 'NRK TV and NRK Radio'
+    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 
     _TESTS = [
         {
-            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
             'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
             'info_dict': {
                 'id': 'MUHH48000314',
@@ -133,7 +133,7 @@ class NRKTVIE(InfoExtractor):
             },
         },
         {
-            'url': 'http://tv.nrk.no/program/mdfp15000514',
+            'url': 'https://tv.nrk.no/program/mdfp15000514',
             'md5': '383650ece2b25ecec996ad7b5bb2a384',
             'info_dict': {
                 'id': 'mdfp15000514',
@@ -146,7 +146,7 @@ class NRKTVIE(InfoExtractor):
         },
         {
             # single playlist video
-            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
             'md5': 'adbd1dbd813edaf532b0a253780719c2',
             'info_dict': {
                 'id': 'MSPO40010515-part2',
@@ -158,7 +158,7 @@ class NRKTVIE(InfoExtractor):
             'skip': 'Only works from Norway',
         },
         {
-            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
             'playlist': [
                 {
                     'md5': '9480285eff92d64f06e02a5367970a7a',
@@ -189,6 +189,10 @@ class NRKTVIE(InfoExtractor):
                 'duration': 6947.5199999999995,
             },
             'skip': 'Only works from Norway',
+        },
+        {
+            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+            'only_matching': True,
         }
     ]
 
@@ -200,24 +204,15 @@ class NRKTVIE(InfoExtractor):
         url = "%s%s" % (baseurl, subtitlesurl)
         self._debug_print('%s: Subtitle url: %s' % (video_id, url))
         captions = self._download_xml(
-            url, video_id, 'Downloading subtitles',
-            transform_source=lambda s: s.replace(r'<br />', '\r\n'))
+            url, video_id, 'Downloading subtitles')
         lang = captions.get('lang', 'no')
-        ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
-        srt = ''
-        for pos, p in enumerate(ps):
-            begin = parse_duration(p.get('begin'))
-            duration = parse_duration(p.get('dur'))
-            starttime = self._subtitles_timecode(begin)
-            endtime = self._subtitles_timecode(begin + duration)
-            srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text)
         return {lang: [
             {'ext': 'ttml', 'url': url},
-            {'ext': 'srt', 'data': srt},
         ]}
 
     def _extract_f4m(self, manifest_url, video_id):
-        return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+        return self._extract_f4m_formats(
+            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -279,7 +274,7 @@ class NRKTVIE(InfoExtractor):
 
         m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
         if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
+            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
         self._sort_formats(formats)
 
         subtitles_url = self._html_search_regex(
index 6ffbe38631a0e854d035586de7396e1714b81037..7f254b867da66f70a79ff7aac5d81eb6f37bd997 100644 (file)
@@ -89,7 +89,7 @@ class NYTimesIE(NYTimesBaseIE):
 
 
 class NYTimesArticleIE(NYTimesBaseIE):
-    _VALID_URL = r'https?://(?:www)?\.nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
+    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
     _TESTS = [{
         'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
         'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
index 155d0ee6a834fa6fb551900e0d0c35dcf0007a5c..215ffe87b55db126300f0c18c98d9c5bfd920ed7 100644 (file)
@@ -2,16 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     unified_strdate,
     int_or_none,
     qualities,
+    unescapeHTML,
 )
 
 
 class OdnoklassnikiIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
     _TESTS = [{
+        # metadata in JSON
         'url': 'http://ok.ru/video/20079905452',
         'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
         'info_dict': {
@@ -19,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Культура меняет нас (прекрасный ролик!))',
             'duration': 100,
-            'upload_date': '20141207',
             'uploader_id': '330537914540',
             'uploader': 'Виталий Добровольский',
             'like_count': int,
-            'age_limit': 0,
+        },
+    }, {
+        # metadataUrl
+        'url': 'http://ok.ru/video/63567059965189-0',
+        'md5': '9676cf86eff5391d35dea675d224e131',
+        'info_dict': {
+            'id': '63567059965189-0',
+            'ext': 'mp4',
+            'title': 'Девушка без комплексов ...',
+            'duration': 191,
+            'uploader_id': '534380003155',
+            'uploader': 'Андрей Мещанинов',
+            'like_count': int,
         },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -33,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://ok.ru/video/%s' % video_id, video_id)
 
         player = self._parse_json(
-            self._search_regex(
-                r"OKVideo\.start\(({.+?})\s*,\s*'VideoAutoplay_player'", webpage, 'player'),
+            unescapeHTML(self._search_regex(
+                r'data-attributes="([^"]+)"', webpage, 'player')),
             video_id)
 
-        metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+        flashvars = player['flashvars']
+
+        metadata = flashvars.get('metadata')
+        if metadata:
+            metadata = self._parse_json(metadata, video_id)
+        else:
+            metadata = self._download_json(
+                compat_urllib_parse_unquote(flashvars['metadataUrl']),
+                video_id, 'Downloading metadata JSON')
 
         movie = metadata['movie']
         title = movie['title']
@@ -52,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):
         uploader = author.get('name')
 
         upload_date = unified_strdate(self._html_search_meta(
-            'ya:ovs:upload_date', webpage, 'upload date'))
+            'ya:ovs:upload_date', webpage, 'upload date', default=None))
 
         age_limit = None
         adult = self._html_search_meta(
-            'ya:ovs:adult', webpage, 'age limit')
+            'ya:ovs:adult', webpage, 'age limit', default=None)
         if adult:
             age_limit = 18 if adult == 'true' else 0
 
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644 (file)
index 0000000..0f1f448
--- /dev/null
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class OnionStudiosIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+    _TESTS = [{
+        'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'info_dict': {
+            'id': '2937',
+            'ext': 'mp4',
+            'title': 'Hannibal charges forward, stops for a cocktail',
+            'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'The A.V. Club',
+            'uploader_id': 'TheAVClub',
+        },
+    }, {
+        'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+
+        formats = []
+        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
+            if determine_ext(src) != 'm3u8':  # m3u8 always results in 403
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
+            webpage, 'title', group='title')
+        description = self._search_regex(
+            r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
+            webpage, 'description', default=None, group='description')
+        thumbnail = self._search_regex(
+            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
+            webpage, 'thumbnail', default=False, group='thumbnail')
+
+        uploader_id = self._search_regex(
+            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
+            webpage, 'uploader id', fatal=False, group='uploader_id')
+        uploader = self._search_regex(
+            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
+            webpage, 'uploader', default=False, group='uploader')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }
index c0e6d643d51982a8f8e0694329f940345c5f1284..a262a9f6d4ec232e78ee34f3ce38b03ea27d01e9 100644 (file)
@@ -12,50 +12,7 @@ from ..utils import (
 )
 
 
-class OoyalaIE(InfoExtractor):
-    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
-
-    _TESTS = [
-        {
-            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
-            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
-            'info_dict': {
-                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
-                'ext': 'mp4',
-                'title': 'Explaining Data Recovery from Hard Drives and SSDs',
-                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
-            },
-        }, {
-            # Only available for ipad
-            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
-            'info_dict': {
-                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
-                'ext': 'mp4',
-                'title': 'Simulation Overview - Levels of Simulation',
-                'description': '',
-            },
-        },
-        {
-            # Information available only through SAS api
-            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
-            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
-            'md5': 'a84001441b35ea492bc03736e59e7935',
-            'info_dict': {
-                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
-                'ext': 'mp4',
-                'title': 'Ooyala video',
-            }
-        }
-    ]
-
-    @staticmethod
-    def _url_for_embed_code(embed_code):
-        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
-
-    @classmethod
-    def _build_url_result(cls, embed_code):
-        return cls.url_result(cls._url_for_embed_code(embed_code),
-                              ie=cls.ie_key())
+class OoyalaBaseIE(InfoExtractor):
 
     def _extract_result(self, info, more_info):
         embedCode = info['embedCode']
@@ -77,11 +34,8 @@ class OoyalaIE(InfoExtractor):
             'thumbnail': more_info['promo'],
         }
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        embedCode = mobj.group('id')
-        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
-        player = self._download_webpage(player_url, embedCode)
+    def _extract(self, player_url, video_id):
+        player = self._download_webpage(player_url, video_id)
         mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
                                         player, 'mobile player url')
         # Looks like some videos are only available for particular devices
@@ -94,7 +48,7 @@ class OoyalaIE(InfoExtractor):
         devices.insert(0, 'unknown')
         for device in devices:
             mobile_player = self._download_webpage(
-                '%s&device=%s' % (mobile_url, device), embedCode,
+                '%s&device=%s' % (mobile_url, device), video_id,
                 'Downloading mobile player JS for %s device' % device)
             videos_info = self._search_regex(
                 r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
@@ -105,10 +59,10 @@ class OoyalaIE(InfoExtractor):
         if not videos_info:
             formats = []
             auth_data = self._download_json(
-                'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode),
-                embedCode)
+                'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id),
+                video_id)
 
-            cur_auth_data = auth_data['authorization_data'][embedCode]
+            cur_auth_data = auth_data['authorization_data'][video_id]
 
             for stream in cur_auth_data['streams']:
                 formats.append({
@@ -123,7 +77,7 @@ class OoyalaIE(InfoExtractor):
                 })
             if formats:
                 return {
-                    'id': embedCode,
+                    'id': video_id,
                     'formats': formats,
                     'title': 'Ooyala video',
                 }
@@ -143,9 +97,100 @@ class OoyalaIE(InfoExtractor):
             videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
             return {
                 '_type': 'playlist',
-                'id': embedCode,
+                'id': video_id,
                 'title': unescapeHTML(videos_more_info['title']),
                 'entries': videos,
             }
         else:
             return self._extract_result(videos_info[0], videos_more_info)
+
+
+class OoyalaIE(OoyalaBaseIE):
+    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+    _TESTS = [
+        {
+            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+            'info_dict': {
+                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+                'ext': 'mp4',
+                'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+            },
+        }, {
+            # Only available for ipad
+            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+            'info_dict': {
+                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+                'ext': 'mp4',
+                'title': 'Simulation Overview - Levels of Simulation',
+                'description': '',
+            },
+        },
+        {
+            # Information available only through SAS api
+            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+            'md5': 'a84001441b35ea492bc03736e59e7935',
+            'info_dict': {
+                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+                'ext': 'mp4',
+                'title': 'Ooyala video',
+            }
+        }
+    ]
+
+    @staticmethod
+    def _url_for_embed_code(embed_code):
+        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+    @classmethod
+    def _build_url_result(cls, embed_code):
+        return cls.url_result(cls._url_for_embed_code(embed_code),
+                              ie=cls.ie_key())
+
+    def _real_extract(self, url):
+        embed_code = self._match_id(url)
+        player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+        return self._extract(player_url, embed_code)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        ooyalaexternal:|
+                        https?://.+?\.ooyala\.com/.*?\bexternalId=
+                    )
+                    (?P<partner_id>[^:]+)
+                    :
+                    (?P<id>.+)
+                    (?:
+                        :|
+                        .*?&pcode=
+                    )
+                    (?P<pcode>.+?)
+                    (&|$)
+                    '''
+
+    _TEST = {
+        'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+        'info_dict': {
+            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+            'ext': 'mp4',
+            'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+            'description': '',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id')
+        video_id = mobj.group('id')
+        pcode = mobj.group('pcode')
+        player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode)
+        return self._extract(player_url, video_id)
index 2249657eb1b796970c155f4baf3445fef9b60681..d2ceedd018fe1f0237aefa17330d5dbe3d94f68c 100644 (file)
@@ -3,9 +3,9 @@ from __future__ import unicode_literals
 import json
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
 from ..utils import (
     parse_iso8601,
-    compat_urllib_parse,
     parse_age_limit,
     int_or_none,
 )
@@ -37,7 +37,7 @@ class OpenFilmIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        player = compat_urllib_parse.unquote_plus(
+        player = compat_urllib_parse_unquote_plus(
             self._og_search_video_url(webpage))
 
         video = json.loads(self._search_regex(
index f179ea2008636f061c6a4cdad6fc69841a291076..6cdc2638b4930dc92835d71f673b560dea99022d 100644 (file)
@@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor):
             r'<div class="attach"><a target="_blank" href="([^"]+)">',
             webpage, 'attachment URL', default=None)
         embed = self._html_search_regex(
-            r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
+            r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
             webpage, 'embedded URL', default=None)
 
         if attach_fn is not None:
index 143a7669639770e0cdfddc55e0b9395893301736..a53479aad762d2fbe8095867a625df996bbf1473 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -31,10 +32,13 @@ class PBSIE(InfoExtractor):
             'info_dict': {
                 'id': '2365006249',
                 'ext': 'mp4',
-                'title': 'A More Perfect Union',
+                'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
                 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
                 'duration': 3190,
             },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
@@ -42,10 +46,13 @@ class PBSIE(InfoExtractor):
             'info_dict': {
                 'id': '2365297690',
                 'ext': 'mp4',
-                'title': 'Losing Iraq',
+                'title': 'FRONTLINE - Losing Iraq',
                 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
                 'duration': 5050,
             },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            }
         },
         {
             'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -53,7 +60,7 @@ class PBSIE(InfoExtractor):
             'info_dict': {
                 'id': '2201174722',
                 'ext': 'mp4',
-                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+                'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
                 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
                 'duration': 801,
             },
@@ -65,10 +72,13 @@ class PBSIE(InfoExtractor):
                 'id': '2365297708',
                 'ext': 'mp4',
                 'description': 'md5:68d87ef760660eb564455eb30ca464fe',
-                'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+                'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
                 'duration': 6559,
                 'thumbnail': 're:^https?://.*\.jpg$',
-            }
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -78,11 +88,14 @@ class PBSIE(InfoExtractor):
                 'display_id': 'killer-typhoon',
                 'ext': 'mp4',
                 'description': 'md5:c741d14e979fc53228c575894094f157',
-                'title': 'Killer Typhoon',
+                'title': 'NOVA - Killer Typhoon',
                 'duration': 3172,
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'upload_date': '20140122',
-            }
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -90,6 +103,36 @@ class PBSIE(InfoExtractor):
                 'id': 'united-states-of-secrets',
             },
             'playlist_count': 2,
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+            'info_dict': {
+                'id': '2280706814',
+                'display_id': 'player',
+                'ext': 'mp4',
+                'title': 'American Experience - Death and the Civil War',
+                'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+                'duration': 6705,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
+        },
+        {
+            'url': 'http://video.pbs.org/video/2365367186/',
+            'info_dict': {
+                'id': '2365367186',
+                'display_id': '2365367186',
+                'ext': 'mp4',
+                'title': 'To Catch A Comet - Full Episode',
+                'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.',
+                'duration': 3342,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         }
     ]
 
@@ -123,7 +166,7 @@ class PBSIE(InfoExtractor):
                 return media_id, presumptive_id, upload_date
 
             url = self._search_regex(
-                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+                r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
                 webpage, 'player URL')
             mobj = re.match(self._VALID_URL, url)
 
@@ -196,6 +239,20 @@ class PBSIE(InfoExtractor):
             rating_str = rating_str.rpartition('-')[2]
         age_limit = US_RATINGS.get(rating_str)
 
+        subtitles = {}
+        closed_captions_url = info.get('closed_captions_url')
+        if closed_captions_url:
+            subtitles['en'] = [{
+                'ext': 'ttml',
+                'url': closed_captions_url,
+            }]
+
+        # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
+        # Try turning it to 'program - title' naming scheme if possible
+        alt_title = info.get('program', {}).get('title')
+        if alt_title:
+            info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+
         return {
             'id': video_id,
             'display_id': display_id,
@@ -206,4 +263,5 @@ class PBSIE(InfoExtractor):
             'age_limit': age_limit,
             'upload_date': upload_date,
             'formats': formats,
+            'subtitles': subtitles,
         }
index c66db3cdc84e55a6a3a904ddf3ff7c09aaac9573..788411ccc18082f59588d40704900c26dba1fe21 100644 (file)
@@ -4,7 +4,7 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
 
 
 class PhotobucketIE(InfoExtractor):
@@ -34,7 +34,7 @@ class PhotobucketIE(InfoExtractor):
         info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
                                        webpage, 'info json')
         info = json.loads(info_json)
-        url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+        url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
         return {
             'id': video_id,
             'url': url,
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644 (file)
index 0000000..a52210f
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        formats = []
+        for _, format_id, src in re.findall(
+                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None) or remove_start(
+            self._og_search_description(webpage), title + '. ')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
+        def extract_count(webpage, label):
+            return str_to_int(self._search_regex(
+                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+                webpage, label, fatal=False))
+
+        view_count = extract_count(webpage, 'Views')
+        comment_count = extract_count(webpage, 'Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
index 596c621d75067255f5be6e4eaf97ba897cc51fe5..06505e96fb9ac81224f03f71299eb80238c0921f 100644 (file)
@@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor):
             'id': '3586',
             'ext': 'flv',
             'title': 'md5:e829428ee28b1deed00de90de49d1da1',
-        }
+        },
+        'skip': 'Not accessible from Travis CI server',
     }
 
     _SONG_FORMATS = {
index 45716c75d9505c5fcb7e8c6d73ec4feaef298aee..8a1c296dda8b57611a0e464387be43ab0fc9a370 100644 (file)
@@ -38,9 +38,7 @@ class PlayedIE(InfoExtractor):
         if m_error:
             raise ExtractorError(m_error.group('msg'), expected=True)
 
-        fields = re.findall(
-            r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
-        data = dict(fields)
+        data = self._hidden_inputs(orig_webpage)
 
         self._sleep(2, video_id)
 
index c3e667e9e72ea0aaf6e5db731f630816e6a2861d..2eb4fd96dcbc071c1c2ecfb596ab20c4526018bd 100644 (file)
@@ -4,7 +4,8 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
 )
 from ..utils import (
     clean_html,
@@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor):
         flashvars = self._html_search_regex(
             r'flashvars="(.+?)"', webpage, 'flashvars')
 
-        infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+        infos = compat_urllib_parse_unquote(flashvars).split(r'&')
         for info in infos:
             videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
             if videovars_match:
@@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor):
                 val = videovars_match.group(2)
 
                 if key == 'title':
-                    video_title = compat_urllib_parse.unquote_plus(val)
+                    video_title = compat_urllib_parse_unquote_plus(val)
                 if key == 'duration':
                     try:
                         duration = int(val)
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644 (file)
index 0000000..72d1b27
--- /dev/null
@@ -0,0 +1,71 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+    IE_NAME = '91porn'
+    _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+    _TEST = {
+        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+        'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+        'info_dict': {
+            'id': '7e42283b4f5ab36da134',
+            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+            'ext': 'mp4',
+            'duration': 431,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+        self._set_cookie('91porn.com', 'language', 'cn_CN')
+        webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+        if '作为游客,你每天只可观看10个视频' in webpage:
+            raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+        title = self._search_regex(
+            r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+        title = title.replace('\n', '')
+
+        # get real url
+        file_id = self._search_regex(
+            r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+        sec_code = self._search_regex(
+            r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+        max_vid = self._search_regex(
+            r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+        url_params = compat_urllib_parse.urlencode({
+            'VID': file_id,
+            'mp4': '1',
+            'seccode': sec_code,
+            'max_vid': max_vid,
+        })
+        info_cn = self._download_webpage(
+            'http://91porn.com/getfile.php?' + url_params, video_id,
+            'get real video url')
+        video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+        duration = parse_duration(self._search_regex(
+            r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+        comment_count = int_or_none(self._search_regex(
+            r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'comment_count': comment_count,
+        }
index 0c8b731cf47267568e43ccd09ff21f1683b4d992..0b7886840fbced3d9fa6fb219050f40ac709c080 100644 (file)
@@ -5,7 +5,8 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 )
@@ -19,8 +20,8 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '882f488fa1f0026f023f33576004a2ed',
         'info_dict': {
@@ -30,7 +31,17 @@ class PornHubIE(InfoExtractor):
             "title": "Seductive Indian beauty strips down and fingers her pink pussy",
             "age_limit": 18
         }
-    }
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
 
     def _extract_count(self, pattern, webpage, name):
         return str_to_int(self._search_regex(
@@ -39,7 +50,8 @@ class PornHubIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = compat_urllib_request.Request(url)
+        req = compat_urllib_request.Request(
+            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
@@ -58,7 +70,7 @@ class PornHubIE(InfoExtractor):
             webpage, 'uploader', fatal=False)
         thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
         if thumbnail:
-            thumbnail = compat_urllib_parse.unquote(thumbnail)
+            thumbnail = compat_urllib_parse_unquote(thumbnail)
 
         view_count = self._extract_count(
             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
@@ -69,9 +81,10 @@ class PornHubIE(InfoExtractor):
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
-        video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+            password = compat_urllib_parse_unquote_plus(
+                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 
         formats = []
index 9688ed94898de231e6c7f1c9dc28d3779da10311..eba4dfbb39576bff355b722c997dd31e07ce370f 100644 (file)
@@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor):
             'duration': 120,
             'view_count': int,
             'average_rating': float,
-            'categories': ['Débutante', 'Scénario', 'Sodomie'],
+            'categories': ['Débutantes', 'Scénario', 'Sodomie'],
             'age_limit': 18,
         }
     }
@@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor):
         view_count = int_or_none(self._search_regex(
             r'(\d+) vues', webpage, 'view count', fatal=False))
         average_rating = self._search_regex(
-            r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False)
+            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
         if average_rating:
             average_rating = float_or_none(average_rating.replace(',', '.'))
 
index 01cc3d9ea3ff845476a7f7b306c3bfee25078b96..304359dc5b189b8ce27c967c2d369b26db334532 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
@@ -31,12 +29,7 @@ class PrimeShareTVIE(InfoExtractor):
         if '>File not exist<' in webpage:
             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         headers = {
             'Referer': url,
index f536e6e6cdfb3d71e21c98614e2baf117387493b..8190ed6766ce5c878fc82700524ec6d012d70a57 100644 (file)
@@ -35,10 +35,7 @@ class PromptFileIE(InfoExtractor):
             raise ExtractorError('Video %s does not exist' % video_id,
                                  expected=True)
 
-        fields = dict(re.findall(r'''(?x)type="hidden"\s+
-            name="(.+?)"\s+
-            value="(.*?)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
         post = compat_urllib_parse.urlencode(fields)
         req = compat_urllib_request.Request(url, post)
         req.add_header('Content-type', 'application/x-www-form-urlencoded')
index 7cc7996642cae1de1ca2a585391d167025b92162..effcf1db37b06d1af40d99233359f0df295eaa03 100644 (file)
@@ -9,18 +9,26 @@ from ..compat import (
     compat_urllib_parse,
 )
 from ..utils import (
-    unified_strdate,
+    ExtractorError,
+    determine_ext,
+    float_or_none,
     int_or_none,
+    unified_strdate,
 )
 
 
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
+            # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242
+            # in response to fixing https://github.com/rg3/youtube-dl/issues/6215:
+            # - malformed f4m manifest support
+            # - proper handling of URLs starting with `https?://` in 2.0 manifests
+            # - recursive child f4m manifests extraction
             'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
             'info_dict': {
                 'id': '2104602',
@@ -177,6 +185,7 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<header class="clearfix">\s*<h3>(.+?)</h3>',
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
@@ -206,8 +215,8 @@ class ProSiebenSat1IE(InfoExtractor):
     def _extract_clip(self, url, webpage):
         clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
-        access_token = 'testclient'
-        client_name = 'kolibri-1.2.5'
+        access_token = 'prosieben'
+        client_name = 'kolibri-2.0.19-splec4'
         client_location = url
 
         videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -217,10 +226,13 @@ class ProSiebenSat1IE(InfoExtractor):
             'ids': clip_id,
         })
 
-        videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+        video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
+
+        if video.get('is_protected') is True:
+            raise ExtractorError('This video is DRM protected.', expected=True)
 
-        duration = float(videos[0]['duration'])
-        source_ids = [source['id'] for source in videos[0]['sources']]
+        duration = float_or_none(video.get('duration'))
+        source_ids = [source['id'] for source in video['sources']]
         source_ids_str = ','.join(map(str, source_ids))
 
         g = '01!8d8F_)r9]4s[qeuXfP%'
@@ -274,23 +286,30 @@ class ProSiebenSat1IE(InfoExtractor):
 
         for source in urls_sources:
             protocol = source['protocol']
+            source_url = source['url']
             if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
                 if not mobj:
                     continue
+                path = mobj.group('path')
+                mp4colon_index = path.rfind('mp4:')
+                app = path[:mp4colon_index]
+                play_path = path[mp4colon_index:]
                 formats.append({
-                    'url': mobj.group('url'),
-                    'app': mobj.group('app'),
-                    'play_path': mobj.group('playpath'),
+                    'url': '%s/%s' % (mobj.group('url'), app),
+                    'app': app,
+                    'play_path': play_path,
                     'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                     'page_url': 'http://www.prosieben.de',
                     'vbr': fix_bitrate(source['bitrate']),
                     'ext': 'mp4',
                     'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
                 })
+            elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+                formats.extend(self._extract_f4m_formats(source_url, clip_id))
             else:
                 formats.append({
-                    'url': source['url'],
+                    'url': source_url,
                     'vbr': fix_bitrate(source['bitrate']),
                 })
 
index 174c8e0ae751fa9d1a875c666faa906d9729330f..1654a641f00db6833056571394b4dbe0b2856150 100644 (file)
@@ -9,25 +9,48 @@ from .common import InfoExtractor
 from ..utils import (
     strip_jsonp,
     unescapeHTML,
+    clean_html,
 )
 from ..compat import compat_urllib_request
 
 
 class QQMusicIE(InfoExtractor):
+    IE_NAME = 'qqmusic'
+    IE_DESC = 'QQ音乐'
     _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
     _TESTS = [{
         'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
-        'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+        'md5': '9ce1c1c8445f561506d2e3cfb0255705',
         'info_dict': {
             'id': '004295Et37taLD',
-            'ext': 'm4a',
+            'ext': 'mp3',
             'title': '可惜没如果',
             'upload_date': '20141227',
             'creator': '林俊杰',
-            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
+            'description': 'md5:d327722d0361576fde558f1ac68a7065',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'There is no mp3-320 version of this song.',
+        'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
+        'info_dict': {
+            'id': '004MsGEo3DdNxV',
+            'ext': 'mp3',
+            'title': '如果',
+            'upload_date': '20050626',
+            'creator': '李季美',
+            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
+            'thumbnail': 're:^https?://.*\.jpg$',
         }
     }]
 
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
     # Reference: m_r_GetRUin() in top_player.js
     # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
     @staticmethod
@@ -58,6 +81,16 @@ class QQMusicIE(InfoExtractor):
         lrc_content = self._html_search_regex(
             r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
             detail_info_page, 'LRC lyrics', default=None)
+        if lrc_content:
+            lrc_content = lrc_content.replace('\\n', '\n')
+
+        thumbnail_url = None
+        albummid = self._search_regex(
+            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+            detail_info_page, 'album mid', default=None)
+        if albummid:
+            thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+                            % (albummid[-2:-1], albummid[-1], albummid)
 
         guid = self.m_r_get_ruin()
 
@@ -65,15 +98,28 @@ class QQMusicIE(InfoExtractor):
             'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
             mid, note='Retrieve vkey', errnote='Unable to get vkey',
             transform_source=strip_jsonp)['key']
-        song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+        formats = []
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'preference': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._check_formats(formats, mid)
+        self._sort_formats(formats)
 
         return {
             'id': mid,
-            'url': song_url,
+            'formats': formats,
             'title': song_name,
             'upload_date': publish_time,
             'creator': singer,
             'description': lrc_content,
+            'thumbnail': thumbnail_url,
         }
 
 
@@ -96,6 +142,8 @@ class QQPlaylistBaseIE(InfoExtractor):
 
 
 class QQMusicSingerIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
     _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
     _TEST = {
         'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
@@ -139,32 +187,131 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
 
 
 class QQMusicAlbumIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
     _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
 
-    _TEST = {
-        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1&play=0',
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
         'info_dict': {
             'id': '000gXCTb2AhRR1',
             'title': '我们都是这样长大的',
-            'description': 'md5:d216c55a2d4b3537fe4415b8767d74d6',
+            'description': 'md5:179c5dce203a5931970d306aa9607ea6',
         },
         'playlist_count': 4,
-    }
+    }, {
+        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+        'info_dict': {
+            'id': '002Y5a3b3AlCu3',
+            'title': '그리고...',
+            'description': 'md5:a48823755615508a95080e81b51ba729',
+        },
+        'playlist_count': 8,
+    }]
 
     def _real_extract(self, url):
         mid = self._match_id(url)
 
-        album_page = self._download_webpage(
-            self.qq_static_url('album', mid), mid, 'Download album page')
+        album = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+            mid, 'Download album page')['data']
 
-        entries = self.get_entries_from_page(album_page)
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+            ) for song in album['list']
+        ]
+        album_name = album.get('name')
+        album_detail = album.get('desc')
+        if album_detail is not None:
+            album_detail = album_detail.strip()
 
-        album_name = self._html_search_regex(
-            r"albumname\s*:\s*'([^']+)',", album_page, 'album name',
-            default=None)
+        return self.playlist_result(entries, mid, album_name, album_detail)
 
-        album_detail = self._html_search_regex(
-            r'<div class="album_detail close_detail">\s*<p>((?:[^<>]+(?:<br />)?)+)</p>',
-            album_page, 'album details', default=None)
 
-        return self.playlist_result(entries, mid, album_name, album_detail)
+class QQMusicToplistIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:toplist'
+    IE_DESC = 'QQ音乐 - 排行榜'
+    _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=toplist&p=global_123',
+        'info_dict': {
+            'id': 'global_123',
+            'title': '美国iTunes榜',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=top_3',
+        'info_dict': {
+            'id': 'top_3',
+            'title': 'QQ音乐巅峰榜·欧美',
+            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=global_106',
+        'info_dict': {
+            'id': 'global_106',
+            'title': '韩国Mnet榜',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        list_type, num_id = list_id.split("_")
+
+        toplist_json = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+            % (list_type, num_id),
+            list_id, 'Download toplist page')
+
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
+            ) for song in toplist_json['songlist']
+        ]
+
+        topinfo = toplist_json.get('topinfo', {})
+        list_name = topinfo.get('ListName')
+        list_description = topinfo.get('info')
+        return self.playlist_result(entries, list_id, list_name, list_description)
+
+
+class QQMusicPlaylistIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:playlist'
+    IE_DESC = 'QQ音乐 - 歌单'
+    _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+        'info_dict': {
+            'id': '3462654915',
+            'title': '韩国5月新歌精选下旬',
+            'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
+        },
+        'playlist_count': 40,
+    }
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        list_json = self._download_json(
+            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
+            % list_id, list_id, 'Download list page',
+            transform_source=strip_jsonp)['cdlist'][0]
+
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+            ) for song in list_json['songlist']
+        ]
+
+        list_name = list_json.get('dissname')
+        list_description = clean_html(unescapeHTML(list_json.get('desc')))
+        return self.playlist_result(entries, list_id, list_name, list_description)
index af7d76cf47e575277de3ffe480fdb8e1eb48b43c..f414e2384e619e8faccab280bdb9e8a9518819c9 100644 (file)
@@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
             'view_count': int,
         },
+        'skip': 'Not accessible from Travis CI server',
     }
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
new file mode 100644 (file)
index 0000000..796adfd
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class RDSIE(InfoExtractor):
+    IE_DESC = 'RDS.ca'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+        'info_dict': {
+            'id': '3.1132799',
+            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+            'ext': 'mp4',
+            'title': 'Fowler Jr. prend la direction de Jacksonville',
+            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
+            'timestamp': 1430397346,
+            'upload_date': '20150430',
+            'duration': 154.354,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        # TODO: extract f4m from 9c9media.com
+        video_url = self._search_regex(
+            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
+            webpage, 'video url')
+
+        title = self._og_search_title(webpage) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
+             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
+            webpage, 'thumbnail', fatal=False)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+        duration = parse_duration(self._search_regex(
+            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
+            webpage, 'duration', fatal=False))
+        age_limit = self._family_friendly_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'age_limit': age_limit,
+        }
index dce64e1517003015722db1097ac83b106cc91136..e4215d546219bb95fe79abfb184da149148962db 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
 
 
 class RTBFIE(InfoExtractor):
@@ -16,34 +17,47 @@ class RTBFIE(InfoExtractor):
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
             'duration': 3099,
-            'timestamp': 1398456336,
-            'upload_date': '20140425',
         }
     }
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+    _QUALITIES = [
+        ('mobile', 'mobile'),
+        ('web', 'SD'),
+        ('url', 'MD'),
+        ('high', 'HD'),
+    ]
 
-        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
 
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
 
-        video_url = data.get('downloadUrl') or data.get('url')
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-video="([^"]+)"', webpage, 'data video')),
+            video_id)
 
-        if data['provider'].lower() == 'youtube':
+        if data.get('provider').lower() == 'youtube':
+            video_url = data.get('downloadUrl') or data.get('url')
             return self.url_result(video_url, 'Youtube')
+        formats = []
+        for key, format_id in self._QUALITIES:
+            format_url = data['sources'].get(key)
+            if format_url:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                })
 
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
index cfce4550ada568cfe13fae859a2bb745671074b5..543d94417f6d52f9cb5f4bd4508d5bcf3d4b984e 100644 (file)
@@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor):
     IE_NAME = 'rtl.nl'
     IE_DESC = 'rtl.nl and rtlxl.nl'
     _VALID_URL = r'''(?x)
-        https?://(www\.)?
+        https?://(?:www\.)?
         (?:
             rtlxl\.nl/\#!/[^/]+/|
-            rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid=
+            rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
         )
         (?P<id>[0-9a-f-]+)'''
 
@@ -43,26 +43,60 @@ class RtlNlIE(InfoExtractor):
             'upload_date': '20150215',
             'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
         }
+    }, {
+        # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
+        'info_dict': {
+            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
+            'ext': 'mp4',
+            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+            'timestamp': 1437233400,
+            'upload_date': '20150718',
+            'duration': 30.474,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # encrypted m3u8 streams, georestricted
+        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         uuid = self._match_id(url)
         info = self._download_json(
-            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
             uuid)
 
         material = info['material'][0]
-        progname = info['abstracts'][0]['name']
-        subtitle = material['title'] or info['episodes'][0]['name']
-        description = material.get('synopsis') or info['episodes'][0]['synopsis']
+        title = info['abstracts'][0]['name']
+        subtitle = material.get('title')
+        if subtitle:
+            title += ' - %s' % subtitle
+        description = material.get('synopsis')
+
+        meta = info.get('meta', {})
 
-        # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
-        videopath = material['videopath'].replace('.f4m', '.m3u8')
-        m3u8_url = 'http://manifest.us.rtl.nl' + videopath
+        # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv.
+        # To workaround this previously adaptive -> flash trick was used to obtain
+        # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118)
+        # and bypass georestrictions as well.
+        # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore
+        # unusable albeit can be fixed by simple string replacement (see
+        # https://github.com/rg3/youtube-dl/pull/6337)
+        # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted
+        # streams are used now.
+        videopath = material['videopath']
+        m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
 
         formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
 
-        video_urlpart = videopath.split('/flash/')[1][:-5]
+        video_urlpart = videopath.split('/adaptive/')[1][:-5]
         PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
 
         formats.extend([
@@ -79,7 +113,7 @@ class RtlNlIE(InfoExtractor):
         self._sort_formats(formats)
 
         thumbnails = []
-        meta = info.get('meta', {})
+
         for p in ('poster_base_url', '"thumb_base_url"'):
             if not meta.get(p):
                 continue
@@ -95,7 +129,7 @@ class RtlNlIE(InfoExtractor):
 
         return {
             'id': uuid,
-            'title': '%s - %s' % (progname, subtitle),
+            'title': title,
             'formats': formats,
             'timestamp': material['original_date'],
             'description': description,
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644 (file)
index 785a804..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
index d0981115da9c64f1addf3108b8a9e6acf0c6508e..12639f08bbc24b2c520b5d93a36b61dbb5e7d831 100644 (file)
@@ -19,7 +19,16 @@ from ..utils import (
 
 class RTSIE(InfoExtractor):
     IE_DESC = 'RTS.ch'
-    _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        rts:(?P<rts_id>\d+)|
+                        https?://
+                            (?:www\.)?rts\.ch/
+                            (?:
+                                (?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|
+                                play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+)
+                            )
+                    )'''
 
     _TESTS = [
         {
@@ -122,6 +131,15 @@ class RTSIE(InfoExtractor):
                 'view_count': int,
             },
         },
+        {
+            # article with videos on rhs
+            'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html',
+            'info_dict': {
+                'id': '6693917',
+                'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse',
+            },
+            'playlist_mincount': 5,
+        },
         {
             'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280',
             'only_matching': True,
@@ -130,7 +148,7 @@ class RTSIE(InfoExtractor):
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
-        video_id = m.group('id') or m.group('id_new')
+        video_id = m.group('rts_id') or m.group('id') or m.group('id_new')
         display_id = m.group('display_id') or m.group('display_id_new')
 
         def download_json(internal_id):
@@ -143,6 +161,15 @@ class RTSIE(InfoExtractor):
         # video_id extracted out of URL is not always a real id
         if 'video' not in all_info and 'audio' not in all_info:
             page = self._download_webpage(url, display_id)
+
+            # article with videos on rhs
+            videos = re.findall(
+                r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:rts:video:(\d+)"',
+                page)
+            if videos:
+                entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos]
+                return self.playlist_result(entries, video_id, self._og_search_title(page))
+
             internal_id = self._html_search_regex(
                 r'<(?:video|audio) data-id="([0-9]+)"', page,
                 'internal video id')
@@ -190,6 +217,7 @@ class RTSIE(InfoExtractor):
                 'tbr': media['rate'] or extract_bitrate(media['url']),
             } for media in info['media'] if media.get('rate')])
 
+        self._check_formats(formats, video_id)
         self._sort_formats(formats)
 
         return {
index 849300140ecbf598874d22b090262eabec1e7ea5..82cd98ac742bf436b24fbbc77cac9a6fb8a44ff6 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 
 
 def _decrypt_url(png):
-    encrypted_data = base64.b64decode(png)
+    encrypted_data = base64.b64decode(png.encode('utf-8'))
     text_index = encrypted_data.find(b'tEXt')
     text_chunk = encrypted_data[text_index - 4:]
     length = struct_unpack('!I', text_chunk[:4])[0]
index 55604637dca22533cd765529dcb2abfb759fd9c1..d9df0686133a6772deb1e58260069857620afc58 100644 (file)
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644 (file)
index 0000000..4e22628
--- /dev/null
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+            'md5': 'ab2093f39be1ca8581963451b3c0234f',
+            'info_dict': {
+                'id': '2058907',
+                'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',
+                'ext': 'mp4',
+                'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+                'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 114,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa',
+            'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+            'info_dict': {
+                'id': '2057306',
+                'display_id': 'superpesis-katso-koko-kausi-ruudussa',
+                'ext': 'mp4',
+                'title': 'Superpesis: katso koko kausi Ruudussa',
+                'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 40,
+                'age_limit': 0,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-media-id="(\d+)"', webpage, 'media id')
+
+        video_xml_url = None
+
+        media_data = self._search_regex(
+            r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage,
+            'media data', default=None)
+        if media_data:
+            media_json = self._parse_json(media_data, display_id, fatal=False)
+            if media_json:
+                xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl')
+                if xml_url:
+                    video_xml_url = xml_url.replace('{ID}', video_id)
+
+        if not video_xml_url:
+            video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id
+
+        video_xml = self._download_xml(video_xml_url, video_id)
+
+        formats = []
+        processed_urls = []
+
+        def extract_formats(node):
+            for child in node:
+                if child.tag.endswith('Files'):
+                    extract_formats(child)
+                elif child.tag.endswith('File'):
+                    video_url = child.text
+                    if not video_url or video_url in processed_urls or 'NOT_USED' in video_url:
+                        return
+                    processed_urls.append(video_url)
+                    ext = determine_ext(video_url)
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', m3u8_id='hls'))
+                    elif ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds'))
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')]
+                        formats.append({
+                            'format_id': '%s-%s' % (proto, label if label else tbr),
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'formats': formats,
+        }
index 10251f29e033ef241618ed7985e214dc0e76cd51..f3c80708c86ab2fc29fbd029b245bbe894af2dfb 100644 (file)
@@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE):
                                     library/view/[^/]+|
                                     api/v1/book
                                 )/
-                                (?P<course_id>\d+)/
+                                (?P<course_id>[^/]+)/
                                     (?:chapter(?:-content)?/)?
                                 (?P<part>part\d+)\.html
     '''
@@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE):
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -122,7 +126,7 @@ class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'
 
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
index b8775c2f99f4a105ae35f1b04a919e64c987df0f..d6ee2d9e2245475d236c12fb6967af68558d8598 100644 (file)
@@ -1,18 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import json
-import re
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    remove_end,
-)
 
 
 class SBSIE(InfoExtractor):
     IE_DESC = 'sbs.com.au'
-    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
 
     _TESTS = [{
         # Original URL is handled by the generic IE which finds the iframe:
@@ -22,38 +16,36 @@ class SBSIE(InfoExtractor):
         'info_dict': {
             'id': '320403011771',
             'ext': 'mp4',
-            'title': 'Dingo Conservation',
-            'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
+            'title': 'Dingo Conservation (The Feed)',
+            'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
             'thumbnail': 're:http://.*\.jpg',
+            'duration': 308,
         },
-        'add_ies': ['generic'],
     }, {
         'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
         'only_matching': True,
+    }, {
+        'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
+        video_id = self._match_id(url)
 
-        release_urls_json = js_to_json(self._search_regex(
-            r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
-            webpage, ''))
-        release_urls = json.loads(release_urls_json)
-        theplatform_url = (
-            release_urls.get('progressive') or release_urls.get('standard'))
+        webpage = self._download_webpage(
+            'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
 
-        title = remove_end(self._og_search_title(webpage), ' (The Feed)')
-        description = self._html_search_meta('description', webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        player_params = self._parse_json(
+            self._search_regex(
+                r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
+            video_id)
+
+        urls = player_params['releaseUrls']
+        theplatform_url = (urls.get('progressive') or urls.get('standard') or
+                           urls.get('html') or player_params['relatedItemsURL'])
 
         return {
             '_type': 'url_transparent',
             'id': video_id,
             'url': theplatform_url,
-
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
         }
index 74fb1983ac4cbb60f83299e441ecc7ca802b1687..d1ab66b3216d5153a5480769fb0723919f3fdb37 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class ScreenwaveMediaIE(InfoExtractor):
-    _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
+    _VALID_URL = r'http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
 
     _TESTS = [{
         'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
@@ -20,7 +20,10 @@ class ScreenwaveMediaIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+
+        playerdata = self._download_webpage(
+            'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id,
+            video_id, 'Downloading player webpage')
 
         vidtitle = self._search_regex(
             r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
@@ -99,7 +102,7 @@ class TeamFourIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
 
         playerdata_url = self._search_regex(
-            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+            r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
             webpage, 'player data URL')
 
         video_title = self._html_search_regex(
index d3b8a1be49702f71a1a8c4eb7bd01d17cf103071..9c53704ea383b1af34e8f8157e327b71c2c3865a 100644 (file)
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
         ["arch", "", "http://ussenate-f.akamaihd.net/"]
     ]
     _IE_NAME = 'senate.gov'
-    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
     _TESTS = [{
         'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
         'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Integrated Senate Video Player'
         }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _search_iframe_url(webpage):
         mobj = re.search(
-            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
             webpage)
         if mobj:
             return mobj.group('url')
index 26ced716e8a875f1c4c5c9527b856475dce83f9e..a07677686a4ecc2923b310c3aeeeaab610bb0868 100644 (file)
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-import re
 import base64
 
 from .common import InfoExtractor
@@ -35,8 +34,7 @@ class SharedIE(InfoExtractor):
             raise ExtractorError(
                 'Video %s does not exist' % video_id, expected=True)
 
-        download_form = dict(re.findall(
-            r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
+        download_form = self._hidden_inputs(webpage)
         request = compat_urllib_request.Request(
             url, compat_urllib_parse.urlencode(download_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -47,7 +45,7 @@ class SharedIE(InfoExtractor):
         video_url = self._html_search_regex(
             r'data-url="([^"]+)"', video_page, 'video URL')
         title = base64.b64decode(self._html_search_meta(
-            'full:title', webpage, 'title')).decode('utf-8')
+            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
         filesize = int_or_none(self._html_search_meta(
             'full:size', webpage, 'file size', fatal=False))
         thumbnail = self._html_search_regex(
index 24746a09a0c2183e8a0bd8e239cb59291b41f19a..93a7cfe15cc764bc61b912dd2e3283d950790565 100644 (file)
@@ -53,7 +53,7 @@ class SmotriIE(InfoExtractor):
                 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
             },
         },
-        # video-password
+        # video-password, not approved by moderator
         {
             'url': 'http://smotri.com/video/view/?id=v1390466a13c',
             'md5': 'f6331cef33cad65a0815ee482a54440b',
@@ -71,7 +71,24 @@ class SmotriIE(InfoExtractor):
             },
             'skip': 'Video is not approved by moderator',
         },
-        # age limit + video-password
+        # video-password
+        {
+            'url': 'http://smotri.com/video/view/?id=v6984858774#',
+            'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
+            'info_dict': {
+                'id': 'v6984858774',
+                'ext': 'mp4',
+                'title': 'Дача Солженицина ПАРОЛЬ 223322',
+                'uploader': 'psavari1',
+                'uploader_id': 'psavari1',
+                'upload_date': '20081103',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'videopassword': '223322',
+            },
+        },
+        # age limit + video-password, not approved by moderator
         {
             'url': 'http://smotri.com/video/view/?id=v15408898bcf',
             'md5': '91e909c9f0521adf5ee86fbe073aad70',
@@ -90,19 +107,22 @@ class SmotriIE(InfoExtractor):
             },
             'skip': 'Video is not approved by moderator',
         },
-        # not approved by moderator, but available
+        # age limit + video-password
         {
-            'url': 'http://smotri.com/video/view/?id=v28888533b73',
-            'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+            'url': 'http://smotri.com/video/view/?id=v7780025814',
+            'md5': 'b4599b068422559374a59300c5337d72',
             'info_dict': {
-                'id': 'v28888533b73',
+                'id': 'v7780025814',
                 'ext': 'mp4',
-                'title': 'Russian Spies Killed By ISIL Child Soldier',
-                'uploader': 'Mopeder',
-                'uploader_id': 'mopeder',
-                'duration': 71,
-                'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
-                'upload_date': '20150114',
+                'title': 'Sexy Beach (пароль 123)',
+                'uploader': 'вАся',
+                'uploader_id': 'asya_prosto',
+                'upload_date': '20081218',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'age_limit': 18,
+            },
+            'params': {
+                'videopassword': '123'
             },
         },
         # swf player
@@ -152,6 +172,10 @@ class SmotriIE(InfoExtractor):
             'getvideoinfo': '1',
         }
 
+        video_password = self._downloader.params.get('videopassword', None)
+        if video_password:
+            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
+
         request = compat_urllib_request.Request(
             'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -161,13 +185,18 @@ class SmotriIE(InfoExtractor):
         video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
 
         if not video_url:
-            if video.get('_moderate_no') or not video.get('moderated'):
+            if video.get('_moderate_no'):
                 raise ExtractorError(
                     'Video %s has not been approved by moderator' % video_id, expected=True)
 
             if video.get('error'):
                 raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
+            if video.get('_pass_protected') == 1:
+                msg = ('Invalid video password' if video_password
+                       else 'This video is protected by a password, use the --video-password option')
+                raise ExtractorError(msg, expected=True)
+
         title = video['title']
         thumbnail = video['_imgURL']
         upload_date = unified_strdate(video['added'])
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
new file mode 100644 (file)
index 0000000..6977afb
--- /dev/null
@@ -0,0 +1,181 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
+
+
+class SnagFilmsEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
+    _TESTS = [{
+        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+        'md5': '2924e9215c6eff7a55ed35b72276bd93',
+        'info_dict': {
+            'id': '74849a00-85a9-11e1-9660-123139220831',
+            'ext': 'mp4',
+            'title': '#whilewewatch',
+        }
+    }, {
+        # invalid labels, 360p is better that 480p
+        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+        'md5': '882fca19b9eb27ef865efeeaed376a48',
+        'info_dict': {
+            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+            'ext': 'mp4',
+            'title': 'Life in Limbo',
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>This film is not playable in your area.<' in webpage:
+            raise ExtractorError(
+                'Film %s is not playable in your area.' % video_id, expected=True)
+
+        formats = []
+        for source in self._parse_json(js_to_json(self._search_regex(
+                r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
+            file_ = source.get('file')
+            if not file_:
+                continue
+            type_ = source.get('type')
+            ext = determine_ext(file_)
+            format_id = source.get('label') or ext
+            if all(v == 'm3u8' for v in (type_, ext)):
+                formats.extend(self._extract_m3u8_formats(
+                    file_, video_id, 'mp4', m3u8_id='hls'))
+            else:
+                bitrate = int_or_none(self._search_regex(
+                    [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
+                    file_, 'bitrate', default=None))
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None))
+                formats.append({
+                    'url': file_,
+                    'format_id': format_id,
+                    'tbr': bitrate,
+                    'height': height,
+                })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
+            webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+
+class SnagFilmsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+        'md5': '19844f897b35af219773fd63bdec2942',
+        'info_dict': {
+            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+            'display_id': 'lost_for_life',
+            'ext': 'mp4',
+            'title': 'Lost for Life',
+            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 4489,
+            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+        'info_dict': {
+            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+            'display_id': 'the_world_cut_project/india',
+            'ext': 'mp4',
+            'title': 'India',
+            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 979,
+            'categories': ['Documentary', 'Sports', 'Politics']
+        }
+    }, {
+        # Film is not playable in your area.
+        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+        'only_matching': True,
+    }, {
+        # Film is not available.
+        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if ">Sorry, the Film you're looking for is not available.<" in webpage:
+            raise ExtractorError(
+                'Film %s is not available.' % display_id, expected=True)
+
+        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+        snag = self._parse_json(
+            self._search_regex(
+                'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+            display_id)
+
+        for item in snag:
+            if item.get('data', {}).get('film', {}).get('id') == film_id:
+                data = item['data']['film']
+                title = data['title']
+                description = clean_html(data.get('synopsis'))
+                thumbnail = data.get('image')
+                duration = int_or_none(data.get('duration') or data.get('runtime'))
+                categories = [
+                    category['title'] for category in data.get('categories', [])
+                    if category.get('title')]
+                break
+        else:
+            title = self._search_regex(
+                r'itemprop="title">([^<]+)<', webpage, 'title')
+            description = self._html_search_regex(
+                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+                webpage, 'description', default=None) or self._og_search_description(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            duration = parse_duration(self._search_regex(
+                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+                webpage, 'duration', fatal=False))
+            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+            'id': film_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644 (file)
index b5fa6f1..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
-    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
-    _TEST = {
-        'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
-        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
-        'info_dict': {
-            'id': '437BE28B89D799D7',
-            'title': 'big_buck_bunny_720p_surround.avi',
-            'ext': 'avi',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://sockshare.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        confirm_hash = self._html_search_regex(r'''(?x)<input\s+
-            type="hidden"\s+
-            value="([^"]*)"\s+
-            name="hash"
-            ''', webpage, 'hash')
-
-        fields = {
-            "hash": confirm_hash.encode('utf-8'),
-            "confirm": "Continue as Free User"
-        }
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.sockshare.com')
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        webpage = self._download_webpage(
-            req, video_id, 'Downloading video page')
-
-        video_url = self._html_search_regex(
-            r'<a href="([^"]*)".+class="download_file_link"',
-            webpage, 'file url')
-        video_url = "http://www.sockshare.com" + video_url
-        title = self._html_search_regex((
-            r'<h1>(.+)<strong>',
-            r'var name = "([^"]+)";'),
-            webpage, 'title', default=None)
-        thumbnail = self._html_search_regex(
-            r'<img\s+src="([^"]*)".+?name="bg"',
-            webpage, 'thumbnail', default=None)
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': determine_ext(title),
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
index f8a4840f7b210d0092154a0910188a22efd9bf8b..ba2d5e19bc0d1de322b4b12ed5b8c0dc31157f7f 100644 (file)
@@ -6,9 +6,12 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
-    compat_urllib_request
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
 )
-from ..utils import sanitize_url_path_consecutive_slashes
 
 
 class SohuIE(InfoExtractor):
@@ -23,9 +26,7 @@ class SohuIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'MV:Far East Movement《The Illest》',
         },
-        'params': {
-            'cn_verification_proxy': 'proxy.uku.im:8888'
-        }
+        'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
         'md5': '699060e75cf58858dd47fb9c03c42cfb',
@@ -117,6 +118,15 @@ class SohuIE(InfoExtractor):
             r'var vid ?= ?["\'](\d+)["\']',
             webpage, 'video path')
         vid_data = _fetch_data(vid, mytv)
+        if vid_data['play'] != 1:
+            if vid_data.get('status') == 12:
+                raise ExtractorError(
+                    'Sohu said: There\'s something wrong in the video.',
+                    expected=True)
+            else:
+                raise ExtractorError(
+                    'Sohu said: The video is only licensed to users in Mainland China.',
+                    expected=True)
 
         formats_json = {}
         for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
@@ -133,23 +143,41 @@ class SohuIE(InfoExtractor):
             formats = []
             for format_id, format_data in formats_json.items():
                 allot = format_data['allot']
-                prot = format_data['prot']
 
                 data = format_data['data']
                 clips_url = data['clipsURL']
                 su = data['su']
 
-                part_str = self._download_webpage(
-                    'http://%s/?prot=%s&file=%s&new=%s' %
-                    (allot, prot, clips_url[i], su[i]),
-                    video_id,
-                    'Downloading %s video URL part %d of %d'
-                    % (format_id, i + 1, part_count))
+                video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
+                retries = 0
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                    }
+
+                    if cdnId is not None:
+                        params['idc'] = cdnId
+
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
+                    if retries > 0:
+                        download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+                        video_id, download_note), video_id)
 
-                part_info = part_str.split('|')
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
 
-                video_url = sanitize_url_path_consecutive_slashes(
-                    '%s%s?key=%s' % (part_info[0], su[i], part_info[3]))
+                    retries += 1
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')
 
                 formats.append({
                     'url': video_url,
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644 (file)
index 0000000..5da66ca
--- /dev/null
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_start,
+    xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        episodes = self._parse_json(
+            self._search_regex(
+                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+            None)
+        return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+    def _get_subtitles(self, config, video_id):
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            if not id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                webpage = ee.cause.read()
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = [
+            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+            for episode in self._get_episodes(webpage)]
+
+        return self.playlist_result(entries, show_id, title, description)
index 183ff50f4fc2b15a2ca99704e9277b8c68abd576..0a6c9fe727895b00c8830b2b3568d62f973541a7 100644 (file)
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
                             (?P<uploader>[\w\d-]+)/
-                            (?!sets/|likes/?(?:$|[?#]))
+                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
                             (?P<title>[\w\d-]+)/?
                             (?P<token>[^?]+?)?(?:[?].*)?$)
                        |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -282,9 +282,11 @@ class SoundcloudSetIE(SoundcloudIE):
             msgs = (compat_str(err['error_message']) for err in info['errors'])
             raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
 
+        entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']]
+
         return {
             '_type': 'playlist',
-            'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']],
+            'entries': entries,
             'id': '%s' % info['id'],
             'title': info['title'],
         }
@@ -307,6 +309,9 @@ class SoundcloudUserIE(SoundcloudIE):
             'title': 'The Royal Concept',
         },
         'playlist_mincount': 1,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -336,7 +341,7 @@ class SoundcloudUserIE(SoundcloudIE):
             if len(new_entries) == 0:
                 self.to_screen('%s: End page received' % uploader)
                 break
-            entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+            entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)
 
         return {
             '_type': 'playlist',
@@ -376,9 +381,7 @@ class SoundcloudPlaylistIE(SoundcloudIE):
         data = self._download_json(
             base_url + data, playlist_id, 'Downloading playlist')
 
-        entries = [
-            self._extract_info_dict(t, quiet=True, secret_token=token)
-            for t in data['tracks']]
+        entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']]
 
         return {
             '_type': 'playlist',
index 59e31198cd08b24af12c8a9175ad3a89a1e53eb8..7fb165a872766f4c1917d2929ec8a73b8f74434e 100644 (file)
@@ -57,3 +57,14 @@ class SouthParkNlIE(SouthParkIE):
         'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
         'playlist_count': 4,
     }]
+
+
+class SouthParkDkIE(SouthParkIE):
+    IE_NAME = 'southparkstudios.dk'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
+        'playlist_count': 4,
+    }]
index b936202f6f3005fe9ae085724566d709c6a484cc..5fa6faf18b738aa32e384972bf65ad56188ad9b4 100644 (file)
@@ -4,7 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
 )
@@ -27,7 +27,7 @@ class SpankwireIE(InfoExtractor):
             'description': 'Crazy Bitch X rated music video.',
             'uploader': 'oreusz',
             'uploader_id': '124697',
-            'upload_date': '20070508',
+            'upload_date': '20070507',
             'age_limit': 18,
         }
     }
@@ -44,7 +44,7 @@ class SpankwireIE(InfoExtractor):
         title = self._html_search_regex(
             r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'<div\s+id="descriptionContent">([^<]+)<',
+            r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
             webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
             r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -64,14 +64,14 @@ class SpankwireIE(InfoExtractor):
             r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
             webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
-            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+            r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
             webpage, 'comment count', fatal=False))
 
         video_urls = list(map(
-            compat_urllib_parse.unquote,
-            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+            compat_urllib_parse_unquote,
+            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
-            password = self._html_search_regex(
+            password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
                 webpage, 'password').replace('+', ' ')
             video_urls = list(map(
index 98cf92d89a1151edfd11b8f15a86eeaa6a83178d..27f4033c547a9700db6af520c4fe4a957e3755c0 100644 (file)
@@ -2,7 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    float_or_none,
+)
 
 
 class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
             'thumbnail': 're:http://.*\.jpg$',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         }
     }, {
@@ -51,9 +55,39 @@ class SpiegeltvIE(InfoExtractor):
         is_wide = media_json['is_wide']
 
         server_json = self._download_json(
-            'http://www.spiegel.tv/streaming_servers/', video_id,
-            note='Downloading server information')
-        server = server_json[0]['endpoint']
+            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+            video_id, note='Downloading server information')
+
+        format = '16x9' if is_wide else '4x3'
+
+        formats = []
+        for streamingserver in server_json['streamingserver']:
+            endpoint = streamingserver.get('endpoint')
+            if not endpoint:
+                continue
+            play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+            if endpoint.startswith('rtmp'):
+                formats.append({
+                    'url': endpoint,
+                    'format_id': 'rtmp',
+                    'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+                    'play_path': play_path,
+                    'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+                    'ext': 'flv',
+                    'rtmp_live': True,
+                })
+            elif determine_ext(endpoint) == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    endpoint.replace('[video]', play_path),
+                    video_id, 'm4v',
+                    preference=1,  # Prefer hls since it allows to workaround georestriction
+                    m3u8_id='hls', fatal=False)
+                if m3u8_formats is not False:
+                    formats.extend(m3u8_formats)
+            else:
+                formats.append({
+                    'url': endpoint,
+                })
 
         thumbnails = []
         for image in media_json['images']:
@@ -65,16 +99,12 @@ class SpiegeltvIE(InfoExtractor):
 
         description = media_json['subtitle']
         duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
-        format = '16x9' if is_wide else '4x3'
-
-        url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
 
         return {
             'id': video_id,
             'title': title,
-            'url': url,
-            'ext': 'm4v',
             'description': description,
             'duration': duration,
-            'thumbnails': thumbnails
+            'thumbnails': thumbnails,
+            'formats': formats,
         }
index becdf658f6e0ce8b209dffc0ce4c96a2857099dc..86d509ae5351a3cc15be66dda9f485d63ec166ba 100644 (file)
@@ -4,37 +4,36 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
-    parse_duration,
-    parse_iso8601,
+    unified_strdate,
 )
 
 
 class SportBoxIE(InfoExtractor):
-    _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
-    _TESTS = [
-        {
-            'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
-            'md5': 'ff56a598c2cf411a9a38a69709e97079',
-            'info_dict': {
-                'id': '80822',
-                'ext': 'mp4',
-                'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
-                'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'timestamp': 1411896237,
-                'upload_date': '20140928',
-                'duration': 4846,
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        }, {
-            'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
-            'only_matching': True,
-        }
-    ]
+    _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+    _TESTS = [{
+        'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+        'md5': 'ff56a598c2cf411a9a38a69709e97079',
+        'info_dict': {
+            'id': '80822',
+            'ext': 'mp4',
+            'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20140928',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+        'only_matching': True,
+    }, {
+        'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        video_id = self._search_regex(
-            r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+        player = self._search_regex(
+            r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
+
+        title = self._html_search_regex(
+            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+            webpage, 'title')
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._html_search_meta(
+            'dateCreated', webpage, 'upload date'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': compat_urlparse.urljoin(url, '/%s' % player),
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
 
-        player = self._download_webpage(
-            'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
-            display_id, 'Downloading player webpage')
+
+class SportBoxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+        'info_dict': {
+            'id': '211355',
+            'ext': 'mp4',
+            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
 
         hls = self._search_regex(
-            r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+            r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
+            webpage, 'hls file')
 
-        formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+        formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
 
-        title = self._html_search_regex(
-            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
-        thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = parse_iso8601(self._search_regex(
-            r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+        title = self._search_regex(
+            r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+
+        thumbnail = self._search_regex(
+            r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
+            webpage, 'thumbnail', default=None)
 
         return {
             'id': video_id,
-            'display_id': display_id,
             'title': title,
-            'description': description,
             'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
             'formats': formats,
         }
index 854d01beeb5cefd1f82d7991ee2c0ce75ad33dfa..e527aa97188b1860e054f8af7c7bd7a33301729e 100644 (file)
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
             webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
-            r'class="views">\s*(\d+)\s*<',
+            r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
             webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
             r'(\d+)</b> Comments?',
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
new file mode 100644 (file)
index 0000000..fc20f66
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+
+class SVTBaseIE(InfoExtractor):
+    def _extract_video(self, url, video_id):
+        info = self._download_json(url, video_id)
+
+        title = info['context']['title']
+        thumbnail = info['context'].get('thumbnailImage')
+
+        video_info = info['video']
+        formats = []
+        for vr in video_info['videoReferences']:
+            vurl = vr['url']
+            ext = determine_ext(vurl)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    vurl, video_id,
+                    ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id=vr.get('playerType')))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    vurl + '?hdcore=3.3.0', video_id,
+                    f4m_id=vr.get('playerType')))
+            else:
+                formats.append({
+                    'format_id': vr.get('playerType'),
+                    'url': vurl,
+                })
+        self._sort_formats(formats)
+
+        duration = video_info.get('materialLength')
+        age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+        }
+
+
+class SVTIE(SVTBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
+        'md5': '9648197555fc1b49e3dc22db4af51d46',
+        'info_dict': {
+            'id': '2900353',
+            'ext': 'flv',
+            'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+            'duration': 27,
+            'age_limit': 0,
+        },
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        widget_id = mobj.group('widget_id')
+        article_id = mobj.group('id')
+        return self._extract_video(
+            'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
+            article_id)
+
+
+class SVTPlayIE(SVTBaseIE):
+    IE_DESC = 'SVT Play and Öppet arkiv'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
+        'md5': 'ade3def0643fa1c40587a422f98edfd9',
+        'info_dict': {
+            'id': '2609989',
+            'ext': 'flv',
+            'title': 'SM veckan vinter, Örebro - Rally, final',
+            'duration': 4500,
+            'thumbnail': 're:^https?://.*[\.-]jpg$',
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
+        'md5': 'c3101a17ce9634f4c1f9800f0746c187',
+        'info_dict': {
+            'id': '1058509',
+            'ext': 'flv',
+            'title': 'Farlig kryssning',
+            'duration': 2566,
+            'thumbnail': 're:^https?://.*[\.-]jpg$',
+            'age_limit': 0,
+        },
+        'skip': 'Only works from Sweden',
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+        return self._extract_video(
+            'http://www.%s.se/video/%s?output=json' % (host, video_id),
+            video_id)
diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svtplay.py
deleted file mode 100644 (file)
index 433dfd1..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-)
-
-
-class SVTPlayIE(InfoExtractor):
-    IE_DESC = 'SVT Play and Öppet arkiv'
-    _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
-        'md5': 'ade3def0643fa1c40587a422f98edfd9',
-        'info_dict': {
-            'id': '2609989',
-            'ext': 'flv',
-            'title': 'SM veckan vinter, Örebro - Rally, final',
-            'duration': 4500,
-            'thumbnail': 're:^https?://.*[\.-]jpg$',
-            'age_limit': 0,
-        },
-    }, {
-        'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
-        'md5': 'c3101a17ce9634f4c1f9800f0746c187',
-        'info_dict': {
-            'id': '1058509',
-            'ext': 'flv',
-            'title': 'Farlig kryssning',
-            'duration': 2566,
-            'thumbnail': 're:^https?://.*[\.-]jpg$',
-            'age_limit': 0,
-        },
-        'skip': 'Only works from Sweden',
-    }]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        host = mobj.group('host')
-
-        info = self._download_json(
-            'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id)
-
-        title = info['context']['title']
-        thumbnail = info['context'].get('thumbnailImage')
-
-        video_info = info['video']
-        formats = []
-        for vr in video_info['videoReferences']:
-            vurl = vr['url']
-            ext = determine_ext(vurl)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    vurl, video_id,
-                    ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id=vr.get('playerType')))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    vurl + '?hdcore=3.3.0', video_id,
-                    f4m_id=vr.get('playerType')))
-            else:
-                formats.append({
-                    'format_id': vr.get('playerType'),
-                    'url': vurl,
-                })
-        self._sort_formats(formats)
-
-        duration = video_info.get('materialLength')
-        age_limit = 18 if video_info.get('inappropriateForChildren') else 0
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'age_limit': age_limit,
-        }
index bfe07b02417a2a44f23a09c10c25d48ec18b5535..73e7657d4bec7b1bc37753923744d92b769d8843 100644 (file)
@@ -8,17 +8,17 @@ from ..utils import parse_filesize
 
 
 class TagesschauIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
 
     _TESTS = [{
-        'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
-        'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
+        'md5': '917a228bc7df7850783bc47979673a09',
         'info_dict': {
-            'id': '1399128',
+            'id': '102143',
             'ext': 'mp4',
-            'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
-            'description': 'md5:69da3c61275b426426d711bde96463ab',
-            'thumbnail': 're:^http:.*\.jpg$',
+            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+            'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+            'thumbnail': 're:^https?:.*\.jpg$',
         },
     }, {
         'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
@@ -28,8 +28,39 @@ class TagesschauIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'md5:695c01bfd98b7e313c501386327aea59',
             'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
-            'thumbnail': 're:^http:.*\.jpg$',
-        }
+            'thumbnail': 're:^https?:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
+        'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+        'info_dict': {
+            'id': '18407',
+            'ext': 'mp3',
+            'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'thumbnail': 're:^https?:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
+        'only_matching': True,
     }]
 
     _FORMATS = {
@@ -49,19 +80,26 @@ class TagesschauIE(InfoExtractor):
             playerpage = self._download_webpage(
                 player_url, display_id, 'Downloading player page')
 
-            medias = re.findall(
-                r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
-                playerpage)
             formats = []
-            for url, ext, res in medias:
+            for media in re.finditer(
+                    r'''(?x)
+                        (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
+                        ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
+                        (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
+                    ''', playerpage):
+                url = media.group('url')
+                type_ = media.group('type')
+                ext = media.group('ext')
+                res = media.group('quality')
                 f = {
-                    'format_id': res + '_' + ext,
+                    'format_id': '%s_%s' % (res, ext) if res else ext,
                     'url': url,
                     'ext': ext,
+                    'vcodec': 'none' if type_ == 'audio' else None,
                 }
                 f.update(self._FORMATS.get(res, {}))
                 formats.append(f)
-            thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+            thumbnail = self._og_search_thumbnail(playerpage)
             title = self._og_search_title(webpage).strip()
             description = self._og_search_description(webpage).strip()
         else:
@@ -99,17 +137,14 @@ class TagesschauIE(InfoExtractor):
                         'filesize_approx': parse_filesize(m.group('filesize_approx')),
                     })
                 formats.append(format)
-            thumbnail_fn = self._search_regex(
-                r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
-                webpage, 'thumbnail', fatal=False)
+            thumbnail = self._og_search_thumbnail(webpage)
             description = self._html_search_regex(
                 r'(?s)<p class="teasertext">(.*?)</p>',
-                webpage, 'description', fatal=False)
+                webpage, 'description', default=None)
             title = self._html_search_regex(
                 r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
 
         self._sort_formats(formats)
-        thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
 
         return {
             'id': display_id,
index 95d58ddd0429e84abeef9f6dc3dea74f011318f9..d1b7264b4ca4a0cb72e491da26d7f5bbc1cc66b7 100644 (file)
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
 import base64
 import binascii
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     qualities,
+    determine_ext,
 )
 from ..compat import compat_ord
 
@@ -49,6 +51,17 @@ class TeamcocoIE(InfoExtractor):
             'params': {
                 'skip_download': True,  # m3u8 downloads
             }
+        }, {
+            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+            'info_dict': {
+                'id': '89341',
+                'ext': 'mp4',
+                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
         }
     ]
     _VIDEO_ID_REGEXES = (
@@ -61,56 +74,70 @@ class TeamcocoIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         display_id = mobj.group('display_id')
-        webpage = self._download_webpage(url, display_id)
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+        if 'src=expired' in urlh.geturl():
+            raise ExtractorError('This video is expired.', expected=True)
 
         video_id = mobj.group('video_id')
         if not video_id:
             video_id = self._html_search_regex(
                 self._VIDEO_ID_REGEXES, webpage, 'video id')
 
-        data = preload = None
-        preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage)
-        if preloads:
-            preload = max([(len(p), p) for p in preloads])[1]
-
-        if not preload:
-            preload = ''.join(re.findall(r'this\.push\("([^"]+)"\);', webpage))
-
-        if not preload:
-            preload = self._html_search_regex([
-                r'player,\[?"([^"]+)"\]?', r'player.init\(\[?"([^"]+)"\]?\)'
-            ], webpage.replace('","', ''), 'preload data', default=None)
-
-        if not preload:
-            preload_codes = self._html_search_regex(
-                r'(function.+)setTimeout\(function\(\)\{playlist',
-                webpage, 'preload codes')
-            base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
-            base64_fragments.remove('init')
-            for i in range(len(base64_fragments)):
-                cur_sequence = (''.join(base64_fragments[i:] + base64_fragments[:i])).encode('ascii')
+        data = None
+
+        preload_codes = self._html_search_regex(
+            r'(function.+)setTimeout\(function\(\)\{playlist',
+            webpage, 'preload codes')
+        base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+        base64_fragments.remove('init')
+
+        def _check_sequence(cur_fragments):
+            if not cur_fragments:
+                return
+            for i in range(len(cur_fragments)):
+                cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii')
                 try:
                     raw_data = base64.b64decode(cur_sequence)
-                except (TypeError, binascii.Error):
+                    if compat_ord(raw_data[0]) == compat_ord('{'):
+                        return json.loads(raw_data.decode('utf-8'))
+                except (TypeError, binascii.Error, UnicodeDecodeError, ValueError):
                     continue
-                if compat_ord(raw_data[0]) == compat_ord('{'):
-                    data = self._parse_json(raw_data.decode('utf-8'), video_id, fatal=False)
 
-        if not preload and not data:
-            raise ExtractorError(
-                'Preload information could not be extracted', expected=True)
+        def _check_data():
+            for i in range(len(base64_fragments) + 1):
+                for j in range(i, len(base64_fragments) + 1):
+                    data = _check_sequence(base64_fragments[:i] + base64_fragments[j:])
+                    if data:
+                        return data
+
+        self.to_screen('Try to compute possible data sequence. This may take some time.')
+        data = _check_data()
 
         if not data:
-            data = self._parse_json(
-                base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id)
+            raise ExtractorError(
+                'Preload information could not be extracted', expected=True)
 
         formats = []
         get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
         for filed in data['files']:
-            if filed['type'] == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    filed['url'], video_id, ext='mp4'))
+            if determine_ext(filed['url']) == 'm3u8':
+                # compat_urllib_parse.urljoin does not work here
+                if filed['url'].startswith('/'):
+                    m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+                else:
+                    m3u8_url = filed['url']
+                m3u8_formats = self._extract_m3u8_formats(
+                    m3u8_url, video_id, ext='mp4')
+                for m3u8_format in m3u8_formats:
+                    if m3u8_format not in formats:
+                        formats.append(m3u8_format)
+            elif determine_ext(filed['url']) == 'f4m':
+                # TODO Correct f4m extraction
+                continue
             else:
+                if filed['url'].startswith('/mp4:protected/'):
+                    # TODO Correct extraction for these files
+                    continue
                 m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
                 if m_format is not None:
                     format_id = m_format.group(1)
index 251a686804b6f26915c3fa25d9f6b2cc1f98ed4b..a0c744fd16b633b08e7bbb632b77cf40e8410710 100644 (file)
@@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):
             'title': 'Con Martín Berasategui, hacer un bacalao al ...',
             'duration': 662,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
         'only_matching': True,
index 466155ef800fbc540292eb343bc9092ab9da6416..f6694149b8e3509b4446458300824a3f3d5fc5de 100644 (file)
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+)
 
 
 class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
             if protocol == 'rtmp':
                 url = url.replace('&mp4:', '')
 
+                tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
             formats.append({
-                'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
-                'width': rendition['frameWidth'],
-                'height': rendition['frameHeight'],
-                'tbr': rendition['encodingRate'] / 1024,
-                'filesize': rendition['size'],
+                'format_id': '_'.join(
+                    ['rtmp', rendition['videoContainer'].lower(),
+                     rendition['videoCodec'].lower(), '%sk' % tbr]),
+                'width': int_or_none(rendition['frameWidth']),
+                'height': int_or_none(rendition['frameHeight']),
+                'tbr': tbr,
+                'filesize': int_or_none(rendition['size']),
                 'protocol': protocol,
                 'ext': ext,
                 'vcodec': rendition['videoCodec'].lower(),
                 'container': rendition['videoContainer'].lower(),
                 'url': url,
             })
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
                 'url': json['thumbnailURL']
             }],
             'thumbnail': json['videoStillURL'],
-            'duration': json['length'] / 1000,
-            'timestamp': float(json['creationDate']) / 1000,
-            'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
-            'view_count': json['playsTotal']
+            'duration': float_or_none(json.get('length'), 1000),
+            'timestamp': float_or_none(json.get('creationDate'), 1000),
+            'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+            'view_count': int_or_none(json.get('playsTotal')),
         }
index 025d0877cb928bb433aff9f6eff19a29d253e006..3a68eaa80ea6867e6806a4f242a8afc910b8ba06 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
-    _TESTS = {
+    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
             'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 92731ad3d7e8dcc3167b50ce1a15e3b035fb7721..83d833e30dbeb60caa43aa272bfd4d35f4507a53 100644 (file)
@@ -26,7 +26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
-           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+           (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
          |theplatform:)(?P<id>[^/\?&]+)'''
 
     _TESTS = [{
@@ -56,6 +56,17 @@ class ThePlatformIE(InfoExtractor):
             # rtmp download
             'skip_download': True,
         }
+    }, {
+        'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+        'info_dict': {
+            'id': 'yMBg9E8KFxZD',
+            'ext': 'mp4',
+            'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+            'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+        }
+    }, {
+        'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -85,6 +96,11 @@ class ThePlatformIE(InfoExtractor):
         if not provider_id:
             provider_id = 'dJ5BDC'
 
+        path = provider_id
+        if mobj.group('media'):
+            path += '/media'
+        path += '/' + video_id
+
         if smuggled_data.get('force_smil_url', False):
             smil_url = url
         elif mobj.group('config'):
@@ -94,8 +110,7 @@ class ThePlatformIE(InfoExtractor):
             config = self._download_json(config_url, video_id, 'Downloading config')
             smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
-            smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?'
-                        'format=smil&mbr=true'.format(provider_id, video_id))
+            smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
 
         sig = smuggled_data.get('sig')
         if sig:
@@ -112,7 +127,7 @@ class ThePlatformIE(InfoExtractor):
         else:
             raise ExtractorError(error_msg, expected=True)
 
-        info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id)
+        info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
         info_json = self._download_webpage(info_url, video_id)
         info = json.loads(info_json)
 
index a77c6a2fc9f2838305145c97e9920d09635ceba7..5d09eb9a8b28cdd2f8bea0743d947f98415564a2 100644 (file)
@@ -1,9 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
-import re
-
 from .common import InfoExtractor
 from ..utils import unified_strdate
 
@@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor):
             song
         )/(?P<id>[A-Za-z0-9]+)/?$'''
     _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
-    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
+    _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
     _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
     _TESTS = [
         {
@@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        song_id = mobj.group('id')
+        song_id = self._match_id(url)
 
         webpage = self._download_webpage(
             self._SONG_URL_TEMPLATE.format(song_id), song_id)
 
-        song_data = json.loads(self._search_regex(
-            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
+        song_data = self._parse_json(self._search_regex(
+            r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id)
+
+        if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None):
+            song_data['audio_server'] = 's3.amazonaws.com'
+        else:
+            song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com'
+
         keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
         url = self._SONG_FILE_URL_TEMPLATE.format(
             "".join(reversed(keys)), **song_data)
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644 (file)
index 0000000..36493a5
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+        'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+        'info_dict': {
+            'id': '487',
+            'ext': 'm4a',
+            'title': '487: Harper High School, Part One',
+            'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
+        return {
+            'id': video_id,
+            'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+            'protocol': 'm3u8_native',
+            'ext': 'm4a',
+            'acodec': 'aac',
+            'vcodec': 'none',
+            'abr': 64,
+            'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+            'description': self._html_search_meta(r'description', webpage, 'description'),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
index 9f9e388c50948d658d1022f8514122643b623a03..13263614cc06b099d929ee71564899ac3620f76a 100644 (file)
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
     IE_NAME = 'tlc.com'
     _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
 
-    _TEST = {
+    # DiscoveryIE has _TESTS
+    _TESTS = [{
         'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
-        'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
         'info_dict': {
-            'id': '853232',
+            'id': '104493',
             'ext': 'mp4',
-            'title': 'Cake Boss: Too Big to Fly',
+            'title': 'Too Big to Fly',
             'description': 'Buddy has taken on a high flying task.',
             'duration': 119,
+            'timestamp': 1393365060,
+            'upload_date': '20140225',
         },
-    }
+        'params': {
+            'skip_download': True,  # requires ffmpef
+        },
+    }]
 
 
 class TlcDeIE(InfoExtractor):
index c5c6fdc51b19fce90d45298858ce345bc901307e..7dbe68b5c228ea6d3fa20d835a60ecf38b820816 100644 (file)
@@ -30,3 +30,31 @@ class TMZIE(InfoExtractor):
             'description': self._og_search_description(webpage),
             'thumbnail': self._html_search_meta('ThumbURL', webpage),
         }
+
+
+class TMZArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+    _TEST = {
+        'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
+        'md5': 'e482a414a38db73087450e3a6ce69d00',
+        'info_dict': {
+            'id': '0_6snoelag',
+            'ext': 'mp4',
+            'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
+            'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake.  She\'s watching me."',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        embedded_video_info_str = self._html_search_regex(
+            r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info')
+
+        embedded_video_info = self._parse_json(
+            embedded_video_info_str, video_id,
+            transform_source=lambda s: s.replace('\\', ''))
+
+        return self.url_result(
+            'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
index d48cbbf140054e639f7191acfa0909972ef3ab76..49516abca690721a83dee5044bb2cdd6540d4a07 100644 (file)
@@ -3,33 +3,70 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
-    parse_duration,
     fix_xml_ampersands,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    str_to_int,
+    xpath_text,
 )
 
 
-class TNAFlixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+class TNAFlixNetworkBaseIE(InfoExtractor):
+    # May be overridden in descendants if necessary
+    _CONFIG_REGEX = [
+        r'flashvars\.config\s*=\s*escape\("([^"]+)"',
+        r'<input[^>]+name="config\d?" value="([^"]+)"',
+    ]
+    _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
+    _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
+    _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
+    _VIEW_COUNT_REGEX = None
+    _COMMENT_COUNT_REGEX = None
+    _AVERAGE_RATING_REGEX = None
+    _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
 
-    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
-    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
-    _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
+    def _extract_thumbnails(self, flix_xml):
 
-    _TEST = {
-        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
-        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
-        'info_dict': {
-            'id': '553878',
-            'display_id': 'Carmella-Decesare-striptease',
-            'ext': 'mp4',
-            'title': 'Carmella Decesare - striptease',
-            'description': '',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'duration': 91,
-            'age_limit': 18,
-        }
-    }
+        def get_child(elem, names):
+            for name in names:
+                child = elem.find(name)
+                if child is not None:
+                    return child
+
+        timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
+        if timeline is None:
+            return
+
+        pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
+        if pattern_el is None or not pattern_el.text:
+            return
+
+        first_el = get_child(timeline, ['imageFirst', 'first'])
+        last_el = get_child(timeline, ['imageLast', 'last'])
+        if first_el is None or last_el is None:
+            return
+
+        first_text = first_el.text
+        last_text = last_el.text
+        if not first_text.isdigit() or not last_text.isdigit():
+            return
+
+        first = int(first_text)
+        last = int(last_text)
+        if first > last:
+            return
+
+        width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
+        height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
+
+        return [{
+            'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
+            'width': width,
+            'height': height,
+        } for i in range(first, last + 1)]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -38,47 +75,195 @@ class TNAFlixIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        title = self._html_search_regex(
-            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
-        description = self._html_search_regex(
-            self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
-
-        age_limit = self._rta_search(webpage)
-
-        duration = self._html_search_meta('duration', webpage, 'duration', default=None)
-        if duration:
-            duration = parse_duration(duration[1:])
-
         cfg_url = self._proto_relative_url(self._html_search_regex(
             self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
 
         cfg_xml = self._download_xml(
-            cfg_url, display_id, note='Downloading metadata',
+            cfg_url, display_id, 'Downloading metadata',
             transform_source=fix_xml_ampersands)
 
-        thumbnail = cfg_xml.find('./startThumb').text
-
         formats = []
+
+        def extract_video_url(vl):
+            return re.sub('speed=\d+', 'speed=', vl.text)
+
+        video_link = cfg_xml.find('./videoLink')
+        if video_link is not None:
+            formats.append({
+                'url': extract_video_url(video_link),
+                'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
+            })
+
         for item in cfg_xml.findall('./quality/item'):
-            video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
-            format_id = item.find('res').text
-            fmt = {
-                'url': video_url,
+            video_link = item.find('./videoLink')
+            if video_link is None:
+                continue
+            res = item.find('res')
+            format_id = None if res is None else res.text
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]', format_id, 'height', default=None))
+            formats.append({
+                'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
                 'format_id': format_id,
-            }
-            m = re.search(r'^(\d+)', format_id)
-            if m:
-                fmt['height'] = int(m.group(1))
-            formats.append(fmt)
+                'height': height,
+            })
+
         self._sort_formats(formats)
 
+        thumbnail = self._proto_relative_url(
+            xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
+        thumbnails = self._extract_thumbnails(cfg_xml)
+
+        title = self._html_search_regex(
+            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+
+        age_limit = self._rta_search(webpage)
+
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration', default=None))
+
+        def extract_field(pattern, name):
+            return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
+
+        description = extract_field(self._DESCRIPTION_REGEX, 'description')
+        uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
+        view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
+        comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
+        average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
+
+        categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
+        categories = categories_str.split(', ') if categories_str is not None else []
+
         return {
             'id': video_id,
             'display_id': display_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
             'duration': duration,
             'age_limit': age_limit,
+            'uploader': uploader,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'average_rating': average_rating,
+            'categories': categories,
             'formats': formats,
         }
+
+
+class TNAFlixIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+
+    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
+    _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
+    _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'
+
+    _TESTS = [{
+        # anonymous uploader, no categories
+        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+        'info_dict': {
+            'id': '553878',
+            'display_id': 'Carmella-Decesare-striptease',
+            'ext': 'mp4',
+            'title': 'Carmella Decesare - striptease',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 91,
+            'age_limit': 18,
+            'uploader': 'Anonymous',
+            'categories': [],
+        }
+    }, {
+        # non-anonymous uploader, categories
+        'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
+        'md5': '0f5d4d490dbfd117b8607054248a07c0',
+        'info_dict': {
+            'id': '6538',
+            'display_id': 'Educational-xxx-video',
+            'ext': 'mp4',
+            'title': 'Educational xxx video',
+            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 164,
+            'age_limit': 18,
+            'uploader': 'bobwhite39',
+            'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'],
+        }
+    }, {
+        'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+        'only_matching': True,
+    }]
+
+
+class EMPFlixIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
+
+    _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>'
+
+    _TESTS = [{
+        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+        'info_dict': {
+            'id': '33051',
+            'display_id': 'Amateur-Finger-Fuck',
+            'ext': 'mp4',
+            'title': 'Amateur Finger Fuck',
+            'description': 'Amateur solo finger fucking.',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'duration': 83,
+            'age_limit': 18,
+            'uploader': 'cwbike',
+            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+        }
+    }, {
+        'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+        'only_matching': True,
+    }]
+
+
+class MovieFapIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+
+    _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
+    _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
+    _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
+    _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'
+
+    _TESTS = [{
+        # normal, multi-format video
+        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
+        'md5': '26624b4e2523051b550067d547615906',
+        'info_dict': {
+            'id': 'be9867c9416c19f54a4a',
+            'display_id': 'experienced-milf-amazing-handjob',
+            'ext': 'mp4',
+            'title': 'Experienced MILF Amazing Handjob',
+            'description': 'Experienced MILF giving an Amazing Handjob',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'darvinfred06',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
+        }
+    }, {
+        # quirky single-format case where the extension is given as fid, but the video is really an flv
+        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
+        'md5': 'fa56683e291fc80635907168a743c9ad',
+        'info_dict': {
+            'id': 'e5da0d3edce5404418f5',
+            'display_id': 'jeune-couple-russe',
+            'ext': 'flv',
+            'title': 'Jeune Couple Russe',
+            'description': 'Amateur',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'whiskeyjar',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Teen'],
+        }
+    }]
index d73ad3762a1b455cfd4bc384c27e2dd85e776dde..c9cb69333f7da0a9f4fe009e79b06433bca83726 100644 (file)
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
         webpage = self._download_webpage(req, display_id)
 
         flashvars = json.loads(self._html_search_regex(
-            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+            r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
 
         video_url = flashvars['video_url']
         if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
         thumbnail = flashvars.get('image_url')
 
         title = self._html_search_regex(
-            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
-            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)
 
         like_count = int_or_none(self._html_search_regex(
-            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
         dislike_count = int_or_none(self._html_search_regex(
-            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
         view_count = self._html_search_regex(
-            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
         if view_count:
             view_count = str_to_int(view_count)
         comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644 (file)
index 0000000..2c4b218
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _TEST = {
+        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+        'info_dict': {
+            'id': '54411',
+            'ext': 'mp4',
+            'title': 'The Kitchen Musical - EP01',
+            'thumbnail': 're:^https?://.*\.png$',
+            'description': 'md5:37532716166069b353e8866e71fefae7',
+            'duration': 2407,
+        },
+        'params': {
+            'skip_download': 'HLS download',
+        },
+    }
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        form_data = {
+            'username': username,
+            'password': password,
+        }
+        payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+        request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_page = self._download_webpage(
+            request, None, False, 'Wrong login info')
+        if not re.search(r'id="tubi-logout"', login_page):
+            raise ExtractorError(
+                'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+            raise ExtractorError(
+                'This video requires login, use --username and --password '
+                'options to provide account credentials.', expected=True)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+        m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
index 828c808a6456b6b99b134cb7ae9d9017de9ad3aa..3d3b635e4cb362515b365ccd8f9321e5124aadeb 100644 (file)
@@ -28,6 +28,28 @@ class TumblrIE(InfoExtractor):
             'description': 'md5:dba62ac8639482759c8eb10ce474586a',
             'thumbnail': 're:http://.*\.jpg',
         }
+    }, {
+        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+        'info_dict': {
+            'id': 'Wmur',
+            'ext': 'mp4',
+            'title': 'naked smoking & stretching',
+            'upload_date': '20150506',
+            'timestamp': 1430931613,
+        },
+        'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
     }]
 
     def _real_extract(self, url):
@@ -36,12 +58,16 @@ class TumblrIE(InfoExtractor):
         blog = m_url.group('blog_name')
 
         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
-        webpage = self._download_webpage(url, video_id)
+        webpage, urlh = self._download_webpage_handle(url, video_id)
 
         iframe_url = self._search_regex(
             r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
-            webpage, 'iframe url')
-        iframe = self._download_webpage(iframe_url, video_id)
+            webpage, 'iframe url', default=None)
+        if iframe_url is None:
+            return self.url_result(urlh.geturl(), 'Generic')
+
+        iframe = self._download_webpage(iframe_url, video_id,
+                                        'Downloading iframe page')
         video_url = self._search_regex(r'<source src="([^"]+)"',
                                        iframe, 'video url')
 
index 29703a8a9a6ddf0981642c28cd2f1f68cc07c7b7..7ae63a4992a74368ec8b5f6a266a298cb6776b79 100644 (file)
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
             'ext': 'mp4',
             'duration': 3715,
             'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
-            'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
         title = xpath_text(item, './title', 'title')
         duration = int_or_none(xpath_text(item, './durate', 'duration'))
         thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
-        description = self._og_search_description(webpage)
+        description = self._html_search_meta('description', webpage)
 
         formats = []
         get_quality = qualities(['3g', 'sd', 'hq'])
index 4de0aac523313eced334aab38a9a20c7bf08dfc7..fad720b681e125ac495b26ba3870dbd65340e3ce 100644 (file)
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
 
         data_content = self._download_webpage(
             'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
-        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
 
         return {
             'id': internal_id,
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644 (file)
index 0000000..fa338b9
--- /dev/null
@@ -0,0 +1,126 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    remove_end,
+)
+
+
+class TV2IE(InfoExtractor):
+    _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tv2.no/v/916509/',
+        'md5': '9cb9e3410b18b515d71892f27856e9b1',
+        'info_dict': {
+            'id': '916509',
+            'ext': 'flv',
+            'title': 'Se Gryttens hyllest av Steven Gerrard',
+            'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+            'timestamp': 1431715610,
+            'upload_date': '20150515',
+            'duration': 156.967,
+            'view_count': int,
+            'categories': list,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        formats = []
+        format_urls = []
+        for protocol in ('HDS', 'HLS'):
+            data = self._download_json(
+                'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
+                video_id, 'Downloading play JSON')['playback']
+            for item in data['items']['item']:
+                video_url = item.get('url')
+                if not video_url or video_url in format_urls:
+                    continue
+                format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+                if not self._is_valid_url(video_url, video_id, format_id):
+                    continue
+                format_urls.append(video_url)
+                ext = determine_ext(video_url)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        video_url, video_id, f4m_id=format_id))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=format_id))
+                elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+                    pass
+                else:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': format_id,
+                        'tbr': int_or_none(item.get('bitrate')),
+                        'filesize': int_or_none(item.get('fileSize')),
+                    })
+        self._sort_formats(formats)
+
+        asset = self._download_json(
+            'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
+            video_id, 'Downloading metadata JSON')['asset']
+
+        title = asset['title']
+        description = asset.get('description')
+        timestamp = parse_iso8601(asset.get('createTime'))
+        duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
+        view_count = int_or_none(asset.get('views'))
+        categories = asset.get('keywords', '').split(',')
+
+        thumbnails = [{
+            'id': thumbnail.get('@type'),
+            'url': thumbnail.get('url'),
+        } for _, thumbnail in asset.get('imageVersions', {}).items()]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'categories': categories,
+            'formats': formats,
+        }
+
+
+class TV2ArticleIE(InfoExtractor):
+    _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+        'info_dict': {
+            'id': '6930542',
+            'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
+            'description': 'md5:339573779d3eea3542ffe12006190954',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.tv2.no/a/6930542',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
+            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+
+        title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+        description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644 (file)
index 0000000..3a4f393
--- /dev/null
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+        'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+        formats = []
+        for info in video.get('path', {}).get('quality', []):
+            video_url = info.get('url')
+            if not video_url:
+                continue
+            format_id = self._search_regex(
+                r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+                'format id', default=None)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'tbr': int_or_none(info.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'thumbnail': video.get('picture'),
+            'duration': int_or_none(video.get('duration')),
+            'formats': formats,
+        }
+
+
+class TVCArticleIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/news/show/id/69944',
+        'info_dict': {
+            'id': '75399',
+            'ext': 'mp4',
+            'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+            'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 278,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+        'info_dict': {
+            'id': '2185',
+            'ext': 'mp4',
+            'title': 'Ещё не поздно. Эфир от 03.08.2013',
+            'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 3316,
+        },
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, self._match_id(url))
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'TVC',
+            'url': self._og_search_video_url(webpage),
+            'title': clean_html(self._og_search_title(webpage)),
+            'description': clean_html(self._og_search_description(webpage)),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
index 102362b295450f58ff085ec9be7d21921a1ac494..dc3a8334a6b335143dff417d805a26df412d8783 100644 (file)
@@ -5,7 +5,9 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     float_or_none,
+    int_or_none,
     parse_age_limit,
 )
 
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
                 'display_id': 'sokrat',
                 'ext': 'flv',
                 'title': 'Сократ',
-                'description': 'md5:a05bd01be310074d5833efc6743be95e',
+                'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
                 'duration': 6586,
-                'age_limit': 0,
+                'age_limit': 12,
             },
+            'skip': 'georestricted',
         },
         {
             'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
-            'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
             'info_dict': {
                 'id': '5142516',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
                 'description': 'md5:027f7dc872948f14c96d19b4178428a4',
                 'duration': 186.080,
                 'age_limit': 0,
             },
+            'skip': 'georestricted',
         }, {
             'url': 'https://cloud.tvigle.ru/video/5267604/',
             'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
         if not video_id:
             webpage = self._download_webpage(url, display_id)
             video_id = self._html_search_regex(
-                r'<li class="video-preview current_playing" id="(\d+)">',
+                r'class="video-preview current_playing" id="(\d+)">',
                 webpage, 'video id')
 
         video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
 
         item = video_data['playlist']['items'][0]
 
+        videos = item.get('videos')
+
+        error_message = item.get('errorMessage')
+        if not videos and error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
         title = item['title']
-        description = item['description']
-        thumbnail = item['thumbnail']
+        description = item.get('description')
+        thumbnail = item.get('thumbnail')
         duration = float_or_none(item.get('durationMilliseconds'), 1000)
         age_limit = parse_age_limit(item.get('ageRestrictions'))
 
         formats = []
         for vcodec, fmts in item['videos'].items():
-            for quality, video_url in fmts.items():
+            for format_id, video_url in fmts.items():
+                if format_id == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=vcodec))
+                    continue
+                height = self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None)
                 formats.append({
                     'url': video_url,
-                    'format_id': '%s-%s' % (vcodec, quality),
+                    'format_id': '%s-%s' % (vcodec, format_id),
                     'vcodec': vcodec,
-                    'height': int(quality[:-1]),
-                    'filesize': item['video_files_size'][vcodec][quality],
+                    'height': int_or_none(height),
+                    'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
                 })
         self._sort_formats(formats)
 
index e83e31a31640fa32e4a19a48a745d279a14d3753..79863e781fd41101c76659ab3b43a85433d25665 100644 (file)
@@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor):
            viasat4play\.no/programmer|
            tv6play\.no/programmer|
            tv3play\.dk/programmer|
+           play\.novatv\.bg/programi
         )/[^/]+/(?P<id>\d+)
         '''
     _TESTS = [
@@ -173,6 +174,22 @@ class TVPlayIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+            'info_dict': {
+                'id': '624952',
+                'ext': 'flv',
+                'title': 'Здравей, България (12.06.2015 г.) ',
+                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+                'duration': 8838,
+                'timestamp': 1434100372,
+                'upload_date': '20150612',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
     ]
 
     def _real_extract(self, url):
index 67e8bfea03476ccf78d2470a973655f2a7213730..c1ee1decc433627ffa52196d44f7563b46d309cc 100644 (file)
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.24video.net/video/view/1044982',
-            'md5': '48dd7646775690a80447a8dca6a2df76',
+            'md5': 'd041af8b5b4246ea466226a0d6693345',
             'info_dict': {
                 'id': '1044982',
                 'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
             webpage, 'upload date'))
 
         uploader = self._html_search_regex(
-            r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+            r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
             webpage, 'uploader', fatal=False)
 
         view_count = int_or_none(self._html_search_regex(
index 94bd6345da18815a50b72502a8b91ae4e30ae2b5..73ce335b7f0a5b5790f8dd65e3ac170e9791b7d8 100644 (file)
@@ -22,8 +22,8 @@ class TwitchBaseIE(InfoExtractor):
 
     _API_BASE = 'https://api.twitch.tv'
     _USHER_BASE = 'http://usher.twitch.tv'
-    _LOGIN_URL = 'https://secure.twitch.tv/user/login'
-    _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login'
+    _LOGIN_URL = 'https://secure.twitch.tv/login'
+    _LOGIN_POST_URL = 'https://passport.twitch.tv/authorize'
     _NETRC_MACHINE = 'twitch'
 
     def _handle_error(self, response):
@@ -59,20 +59,12 @@ class TwitchBaseIE(InfoExtractor):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
 
-        authenticity_token = self._search_regex(
-            r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
-            login_page, 'authenticity token')
-
-        login_form = {
-            'utf8': '✓'.encode('utf-8'),
-            'authenticity_token': authenticity_token,
-            'redirect_on_login': '',
-            'embed_form': 'false',
-            'mp_source_action': 'login-button',
-            'follow': '',
-            'login': username,
-            'password': password,
-        }
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'login': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
+        })
 
         request = compat_urllib_request.Request(
             self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
@@ -80,11 +72,15 @@ class TwitchBaseIE(InfoExtractor):
         response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
-        m = re.search(
-            r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
-        if m:
+        error_message = self._search_regex(
+            r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
+            response, 'error message', default=None)
+        if error_message:
             raise ExtractorError(
-                'Unable to login: %s' % m.group('msg').strip(), expected=True)
+                'Unable to login. Twitch said: %s' % error_message, expected=True)
+
+        if '>Reset your password<' in response:
+            self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
 
     def _prefer_source(self, formats):
         try:
@@ -189,17 +185,17 @@ class TwitchVodIE(TwitchItemBaseIE):
     _ITEM_SHORTCUT = 'v'
 
     _TEST = {
-        'url': 'http://www.twitch.tv/ksptv/v/3622000',
+        'url': 'http://www.twitch.tv/riotgames/v/6528877',
         'info_dict': {
-            'id': 'v3622000',
+            'id': 'v6528877',
             'ext': 'mp4',
-            'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+            'title': 'LCK Summer Split - Week 6 Day 1',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'duration': 6951,
-            'timestamp': 1419028564,
-            'upload_date': '20141219',
-            'uploader': 'KSPTV',
-            'uploader_id': 'ksptv',
+            'duration': 17208,
+            'timestamp': 1435131709,
+            'upload_date': '20150624',
+            'uploader': 'Riot Games',
+            'uploader_id': 'riotgames',
             'view_count': int,
         },
         'params': {
@@ -215,7 +211,7 @@ class TwitchVodIE(TwitchItemBaseIE):
             '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
             'Downloading %s access token' % self._ITEM_TYPE)
         formats = self._extract_m3u8_formats(
-            '%s/vod/%s?nauth=%s&nauthsig=%s'
+            '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
             % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
             item_id, 'mp4')
         self._prefer_source(formats)
@@ -314,9 +310,9 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE):
 
 class TwitchStreamIE(TwitchBaseIE):
     IE_NAME = 'twitch:stream'
-    _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.twitch.tv/shroomztv',
         'info_dict': {
             'id': '12772022048',
@@ -335,7 +331,10 @@ class TwitchStreamIE(TwitchBaseIE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.twitch.tv/miracle_doto#profile-0',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -350,6 +349,12 @@ class TwitchStreamIE(TwitchBaseIE):
                 'http://www.twitch.tv/%s/profile' % channel_id,
                 'TwitchProfile', channel_id)
 
+        # Channel name may be typed if different case than the original channel name
+        # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing
+        # an invalid m3u8 URL. Working around by use of original channel name from stream
+        # JSON and fallback to lowercase if it's not available.
+        channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
+
         access_token = self._download_json(
             '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
             'Downloading channel access token')
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
new file mode 100644 (file)
index 0000000..1aaa063
--- /dev/null
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+    float_or_none,
+    unescapeHTML,
+)
+
+
+class TwitterCardIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+        'md5': 'a74f50b310c83170319ba16de6955192',
+        'info_dict': {
+            'id': '560070183650213889',
+            'ext': 'mp4',
+            'title': 'TwitterCard',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 30.033,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Different formats served for different User-Agents
+        USER_AGENTS = [
+            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
+            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
+        ]
+
+        config = None
+        formats = []
+        for user_agent in USER_AGENTS:
+            request = compat_urllib_request.Request(url)
+            request.add_header('User-Agent', user_agent)
+            webpage = self._download_webpage(request, video_id)
+
+            config = self._parse_json(
+                unescapeHTML(self._search_regex(
+                    r'data-player-config="([^"]+)"', webpage, 'data player config')),
+                video_id)
+
+            video_url = config['playlist'][0]['source']
+
+            f = {
+                'url': video_url,
+            }
+
+            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+            if m:
+                f.update({
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        thumbnail = config.get('posterImageUrl')
+        duration = float_or_none(config.get('duration'))
+
+        return {
+            'id': video_id,
+            'title': 'TwitterCard',
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
index 4667ed83b71f4aec5f081741834e2c9cca010e82..4a0eaf65f78be0dbac2b089aa064eec043b15e41 100644 (file)
@@ -15,7 +15,8 @@ from ..utils import (
 class UdemyIE(InfoExtractor):
     IE_NAME = 'udemy'
     _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
-    _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+    _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
+    _ORIGIN_URL = 'https://www.udemy.com'
     _NETRC_MACHINE = 'udemy'
 
     _TESTS = [{
@@ -74,29 +75,36 @@ class UdemyIE(InfoExtractor):
                 expected=True)
 
         login_popup = self._download_webpage(
-            'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
-            'Downloading login popup')
+            self._LOGIN_URL, None, 'Downloading login popup')
 
-        if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
+        def is_logged(webpage):
+            return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
+
+        # already logged in
+        if is_logged(login_popup):
             return
 
-        csrf = self._html_search_regex(
-            r'<input type="hidden" name="csrf" value="(.+?)"',
-            login_popup, 'csrf token')
+        login_form = self._form_hidden_inputs('login-form', login_popup)
+
+        login_form.update({
+            'email': username.encode('utf-8'),
+            'password': password.encode('utf-8'),
+        })
 
-        login_form = {
-            'email': username,
-            'password': password,
-            'csrf': csrf,
-            'displayType': 'json',
-            'isSubmitted': '1',
-        }
         request = compat_urllib_request.Request(
             self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        response = self._download_json(
+        request.add_header('Referer', self._ORIGIN_URL)
+        request.add_header('Origin', self._ORIGIN_URL)
+
+        response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
-        if 'returnUrl' not in response:
+        if not is_logged(response):
+            error = self._html_search_regex(
+                r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
             raise ExtractorError('Unable to log in')
 
     def _real_extract(self, url):
index c08428acfab446dff6157035ef032ae326199ebf..2151f83382d6b3185722b54de2d0eab2a988c6ae 100644 (file)
@@ -11,6 +11,7 @@ from ..compat import compat_urlparse
 
 
 class UDNEmbedIE(InfoExtractor):
+    IE_DESC = '聯合影音'
     _VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://video.udn.com/embed/news/300040',
index 96c809eaf7155290210e0f8b18d3a2c7c948ba97..c4751050ec60901c2750b2f1692059f6246e23dc 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     ExtractorError,
     qualities,
@@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        deliver_url = self._search_regex(
-            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
-            webpage, 'deliver URL')
+        deliver_url = self._proto_relative_url(self._search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+            webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
 
         deliver_page = self._download_webpage(
             deliver_url, video_id, 'Downloading iframe page')
@@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor):
 
         player = self._parse_json(
             self._search_regex(
-                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
+                deliver_page, 'player'),
             video_id)
 
         quality = qualities(['flash', 'html5'])
index dd026748dcbb536f9f49181b0d211bf0a9157777..722eb52368825b92c88506ff33d79bf1f2f91a32 100644 (file)
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';',
-                                          redirect_page, 'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id,
+        # need to get the page 3 times for the correct jsSecretToken cookie
+        # which is necessary for the correct title
+        def get_session_id():
+            redirect_page = self._download_webpage(url, video_id)
+            session_id_url = self._search_regex(
+                r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+                'session id url')
+            self._download_webpage(
+                compat_urlparse.urljoin(url, session_id_url), video_id,
+                'Getting session id')
+
+        get_session_id()
+        get_session_id()
+
+        webpage = self._download_webpage(url, video_id,
                                          'Downloading redirect page')
 
         title = self._html_search_regex(r'<title>(.*)</title>',
index 346edf485998e973f4a56f8dee3213093f90aae1..0d8d832cc0890a8cb99c8388b1b9fc1964396d23 100644 (file)
@@ -5,6 +5,7 @@ import json
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_urllib_parse_unquote,
     compat_urlparse,
 )
 from ..utils import (
@@ -76,7 +77,7 @@ class VeeHDIE(InfoExtractor):
 
         if config_json:
             config = json.loads(config_json)
-            video_url = compat_urlparse.unquote(config['clip']['url'])
+            video_url = compat_urllib_parse_unquote(config['clip']['url'])
 
         if not video_url:
             video_url = self._html_search_regex(
index 69dc9a75964f4c81a6aa010ec48b25db5c517b4a..f38a72fde8974a7a1ea290de04281f67079b1a16 100644 (file)
@@ -4,11 +4,26 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+)
 
 
 class VGTVIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/[^/]+/(?P<id>[0-9]+)'
+    IE_DESC = 'VGTV and BTTV'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        vgtv:|
+                        http://(?:www\.)?
+                    )
+                    (?P<host>vgtv|bt)
+                    (?:
+                        :|
+                        \.no/(?:tv/)?\#!/(?:video|live)/
+                    )
+                    (?P<id>[0-9]+)
+                    '''
     _TESTS = [
         {
             # streamType: vod
@@ -47,16 +62,16 @@ class VGTVIE(InfoExtractor):
         },
         {
             # streamType: live
-            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
             'info_dict': {
-                'id': '100015',
+                'id': '113063',
                 'ext': 'flv',
-                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
-                'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'description': 'md5:b3743425765355855f88e096acc93231',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 0,
-                'timestamp': 1407423348,
-                'upload_date': '20140807',
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
                 'view_count': int,
             },
             'params': {
@@ -64,25 +79,47 @@ class VGTVIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+
+        HOST_WEBSITES = {
+            'vgtv': 'vgtv',
+            'bt': 'bttv',
+        }
+
         data = self._download_json(
-            'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
+            'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website'
+            % (host, video_id, HOST_WEBSITES[host]),
             video_id, 'Downloading media JSON')
 
+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
         streams = data['streamUrls']
+        stream_type = data.get('streamType')
 
         formats = []
 
         hls_url = streams.get('hls')
         if hls_url:
-            formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4'))
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', m3u8_id='hls'))
 
         hds_url = streams.get('hds')
-        if hds_url:
-            formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id))
+        # wasLive hds are always 404
+        if hds_url and stream_type != 'wasLive':
+            formats.extend(self._extract_f4m_formats(
+                hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                video_id, f4m_id='hds'))
 
         mp4_url = streams.get('mp4')
         if mp4_url:
@@ -107,11 +144,60 @@ class VGTVIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': data['title'],
+            'title': self._live_title(data['title']),
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
             'formats': formats,
+            'is_live': True if stream_type == 'live' else False,
         }
+
+
+class BTArticleIE(InfoExtractor):
+    IE_NAME = 'bt:article'
+    IE_DESC = 'Bergens Tidende Articles'
+    _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+    _TEST = {
+        'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
+        'md5': 'd055e8ee918ef2844745fcfd1a4175fb',
+        'info_dict': {
+            'id': '23199',
+            'ext': 'mp4',
+            'title': 'Alrekstad internat',
+            'description': 'md5:dc81a9056c874fedb62fc48a300dac58',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 191,
+            'timestamp': 1289991323,
+            'upload_date': '20101117',
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, self._match_id(url))
+        video_id = self._search_regex(
+            r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id')
+        return self.url_result('vgtv:bt:%s' % video_id, 'VGTV')
+
+
+class BTVestlendingenIE(InfoExtractor):
+    IE_NAME = 'bt:vestlendingen'
+    IE_DESC = 'Bergens Tidende - Vestlendingen'
+    _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+    }
+
+    def _real_extract(self, url):
+        return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream')
index 04e2b0ba7849adee473a42e471d77ced7df0652c..01af7a99574401b38e487b01dd5104e674740bbc 100644 (file)
@@ -1,5 +1,4 @@
 from __future__ import unicode_literals
-import re
 
 from .common import InfoExtractor
 from .ooyala import OoyalaIE
@@ -7,25 +6,29 @@ from ..utils import ExtractorError
 
 
 class ViceIE(InfoExtractor):
-    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
 
-    _TEST = {
-        'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
-        'info_dict': {
-            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
-            'ext': 'mp4',
-            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-        },
-        'params': {
-            # Requires ffmpeg (m3u8 manifest)
-            'skip_download': True,
-        },
-    }
+    _TESTS = [
+        {
+            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+            'info_dict': {
+                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+                'ext': 'mp4',
+                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+            },
+            'params': {
+                # Requires ffmpeg (m3u8 manifest)
+                'skip_download': True,
+            },
+        }, {
+            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+            'only_matching': True,
+        }
+    ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        name = mobj.group('name')
-        webpage = self._download_webpage(url, name)
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
         try:
             embed_code = self._search_regex(
                 r'embedCode=([^&\'"]+)', webpage,
index eb309a7cdf99b3ebc4bde755fe09d47505516f28..78ff6310a07f6864abda658ca1e804e26594460b 100644 (file)
@@ -8,20 +8,23 @@ from ..compat import compat_urllib_request
 
 
 class VideoMegaIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
-        (?:www\.)?videomega\.tv/
-        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
-        '''
-    _TEST = {
-        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
-        'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+    _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
+        'md5': 'cc1920a58add3f05c6a93285b84fb3aa',
         'info_dict': {
-            'id': '4GNA688SU99US886ANG4',
+            'id': 'AOSQBJYKIDDIKYJBQSOA',
             'ext': 'mp4',
-            'title': 'BigBuckBunny_320x180',
+            'title': '1254207',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
-    }
+    }, {
+        'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -29,12 +32,13 @@ class VideoMegaIE(InfoExtractor):
         iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
         req = compat_urllib_request.Request(iframe_url)
         req.add_header('Referer', url)
+        req.add_header('Cookie', 'noadvtday=0')
         webpage = self._download_webpage(req, video_id)
 
         title = self._html_search_regex(
-            r'<title>(.*?)</title>', webpage, 'title')
+            r'<title>(.+?)</title>', webpage, 'title')
         title = re.sub(
-            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
+            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
         thumbnail = self._search_regex(
             r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
         video_url = self._search_regex(
index ececc7ee0118932716ca9bdb06779cf94e6dc0ec..591024eaded0cdddbb0779bd942c0fa8f63d86a6 100644 (file)
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
 
         formats = [
             {
-                'url': base64.b64decode(res['u']).decode('utf-8'),
+                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
                 'ext': 'flv',
                 'format_id': res['l'],
             } for res in settings['res'] if res['u']
index bd953fb4cc212f50dce2cac624c9391a14e82898..e0b55078b2c9af8bf654ee6cd6982305074cb39b 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 class VidmeIE(InfoExtractor):
     _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://vid.me/QNB',
         'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
         'info_dict': {
@@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor):
             'upload_date': '20140725',
             'thumbnail': 're:^https?://.*\.jpg',
         },
-    }
+    }, {
+        # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+        'url': 'https://vid.me/e/Wmur',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
+        url = url.replace('vid.me/e/', 'vid.me/')
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
index 619039e516c96209c38953e6b73f93a6895df54c..15377097e658b20e75a08f19b370be3bef2158c7 100644 (file)
@@ -38,11 +38,14 @@ class VierIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
 
         video_id = self._search_regex(
-            r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+            webpage, 'video id')
         application = self._search_regex(
-            r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+            webpage, 'application', default='vier_vod')
         filename = self._search_regex(
-            r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+            webpage, 'filename')
 
         playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
         formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
index 1742e66f481c91477ae41e5c42a262e35b7adc47..6ef36290b417a846bb2a6f36cc80fd2a6e59e105 100644 (file)
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_request
+from ..compat import (
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+)
 
 
 class ViewsterIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P<id>\d+-\d+-\d+)'
+    _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
     _TESTS = [{
-        # movielink, paymethod=fre
-        'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/',
-        'playlist': [{
-            'md5': '8f9d94b282d80c42b378dffdbb11caf3',
-            'info_dict': {
-                'id': '1293-19341-000-movie',
-                'ext': 'flv',
-                'title': "'Hout' (Wood) - Movie",
-            },
-        }],
-        'info_dict': {
-            'id': '1293-19341-000',
-            'title': "'Hout' (Wood)",
-            'description': 'md5:925733185a9242ef96f436937683f33b',
-        }
-    }, {
-        # movielink, paymethod=adv
+        # movie, Type=Movie
         'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/',
-        'playlist': [{
-            'md5': '77a005453ca7396cbe3d35c9bea30aef',
-            'info_dict': {
-                'id': '1140-11855-000-movie',
-                'ext': 'flv',
-                'title': "THE LISTENING PROJECT - Movie",
-            },
-        }],
+        'md5': '14d3cfffe66d57b41ae2d9c873416f01',
         'info_dict': {
             'id': '1140-11855-000',
-            'title': "THE LISTENING PROJECT",
-            'description': 'md5:714421ae9957e112e672551094bf3b08',
-        }
+            'ext': 'flv',
+            'title': 'The listening Project',
+            'description': 'md5:bac720244afd1a8ea279864e67baa071',
+            'timestamp': 1214870400,
+            'upload_date': '20080701',
+            'duration': 4680,
+        },
     }, {
-        # direct links, no movielink
-        'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/',
-        'playlist': [{
-            'md5': '0307b7eac6bfb21ab0577a71f6eebd8f',
-            'info_dict': {
-                'id': '1198-56411-000-trailer',
-                'ext': 'mp4',
-                'title': "Sinister - Trailer",
-            },
-        }, {
-            'md5': '80b9ee3ad69fb368f104cb5d9732ae95',
-            'info_dict': {
-                'id': '1198-56411-000-behind-scenes',
-                'ext': 'mp4',
-                'title': "Sinister - Behind Scenes",
-            },
-        }, {
-            'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5',
-            'info_dict': {
-                'id': '1198-56411-000-scene-from-movie',
-                'ext': 'mp4',
-                'title': "Sinister - Scene from movie",
-            },
-        }],
+        # series episode, Type=Episode
+        'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/',
+        'md5': 'd5434c80fcfdb61651cc2199a88d6ba3',
         'info_dict': {
-            'id': '1198-56411-000',
-            'title': "Sinister",
-            'description': 'md5:014c40b0488848de9683566a42e33372',
-        }
+            'id': '1284-19427-001',
+            'ext': 'flv',
+            'title': 'The World and a Wall',
+            'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',
+            'timestamp': 1428192000,
+            'upload_date': '20150405',
+            'duration': 1500,
+        },
+    }, {
+        # serie, Type=Serie
+        'url': 'http://www.viewster.com/serie/1303-19426-000/',
+        'info_dict': {
+            'id': '1303-19426-000',
+            'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?',
+            'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11',
+        },
+        'playlist_count': 13,
+    }, {
+        # unfinished serie, no Type
+        'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/',
+        'info_dict': {
+            'id': '1284-19427-000',
+            'title': 'Baby Steps—Season 2',
+            'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',
+        },
+        'playlist_mincount': 16,
     }]
 
     _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
+    _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA=='
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        request = compat_urllib_request.Request(
-            'http://api.live.viewster.com/api/v1/movie/%s' % video_id)
+    def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+        request = compat_urllib_request.Request(url)
         request.add_header('Accept', self._ACCEPT_HEADER)
+        request.add_header('Auth-token', self._AUTH_TOKEN)
+        return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
 
-        movie = self._download_json(
-            request, video_id, 'Downloading movie metadata JSON')
-
-        title = movie.get('title') or movie['original_title']
-        description = movie.get('synopsis')
-        thumbnail = movie.get('large_artwork') or movie.get('artwork')
-
-        entries = []
-        for clip in movie['play_list']:
-            entry = None
-
-            # movielink api
-            link_request = clip.get('link_request')
-            if link_request:
-                request = compat_urllib_request.Request(
-                    'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s&currency=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s'
-                    % link_request)
-                request.add_header('Accept', self._ACCEPT_HEADER)
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
 
-                movie_link = self._download_json(
-                    request, video_id, 'Downloading movie link JSON', fatal=False)
+        info = self._download_json(
+            'https://public-api.viewster.com/search/%s' % video_id,
+            video_id, 'Downloading entry JSON')
 
-                if movie_link:
-                    formats = self._extract_f4m_formats(
-                        movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id)
-                    self._sort_formats(formats)
-                    entry = {
-                        'formats': formats,
-                    }
+        entry_id = info.get('Id') or info['id']
 
-            # direct link
-            clip_url = clip.get('clip_data', {}).get('url')
-            if clip_url:
-                entry = {
-                    'url': clip_url,
-                    'ext': 'mp4',
-                }
+        # unfinished serie has no Type
+        if info.get('Type') in ['Serie', None]:
+            episodes = self._download_json(
+                'https://public-api.viewster.com/series/%s/episodes' % entry_id,
+                video_id, 'Downloading series JSON')
+            entries = [
+                self.url_result(
+                    'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster')
+                for episode in episodes]
+            title = (info.get('Title') or info['Synopsis']['Title']).strip()
+            description = info.get('Synopsis', {}).get('Detailed')
+            return self.playlist_result(entries, video_id, title, description)
 
-            if entry:
-                entry.update({
-                    'id': '%s-%s' % (video_id, clip['canonical_title']),
-                    'title': '%s - %s' % (title, clip['title']),
+        formats = []
+        for media_type in ('application/f4m+xml', 'application/x-mpegURL'):
+            media = self._download_json(
+                'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
+                % (entry_id, compat_urllib_parse.quote(media_type)),
+                video_id, 'Downloading %s JSON' % media_type, fatal=False)
+            if not media:
+                continue
+            video_url = media.get('Uri')
+            if not video_url:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
+                video_url += '&' if '?' in video_url else '?'
+                video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+                formats.extend(self._extract_f4m_formats(
+                    video_url, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', m3u8_id='hls',
+                    fatal=False  # m3u8 sometimes fail
+                ))
+            else:
+                formats.append({
+                    'url': video_url,
                 })
-                entries.append(entry)
+        self._sort_formats(formats)
 
-        playlist = self.playlist_result(entries, video_id, title, description)
-        playlist['thumbnail'] = thumbnail
-        return playlist
+        synopsis = info.get('Synopsis', {})
+        # Prefer title outside synopsis since it's less messy
+        title = (info.get('Title') or synopsis['Title']).strip()
+        description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short')
+        duration = int_or_none(info.get('Duration'))
+        timestamp = parse_iso8601(info.get('ReleaseDate'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index cf6af1e5cdb6315d325d2bd355d384cc283a3e0c..ddbd395c89ee183dee719f549cbe534c64835769 100644 (file)
+# coding: utf-8
 from __future__ import unicode_literals
 
-import re
+import json
+import time
+import hmac
+import hashlib
+import itertools
 
-from ..compat import (
-    compat_urlparse,
-    compat_urllib_request,
-)
 from ..utils import (
     ExtractorError,
-    unescapeHTML,
-    unified_strdate,
-    US_RATINGS,
-    determine_ext,
-    mimetype2ext,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
 )
+from ..compat import compat_urllib_request
 from .common import InfoExtractor
 
 
-class VikiIE(InfoExtractor):
-    IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+    _APP = '65535a'
+    _APP_VERSION = '2.2.5.1428709186'
+    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+    _NETRC_MACHINE = 'viki'
+
+    _token = None
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
+        path += '?' if '?' not in path else '&'
+        if not timestamp:
+            timestamp = int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
+        sig = hmac.new(
+            self._APP_SECRET.encode('ascii'),
+            query.encode('ascii'),
+            hashlib.sha1
+        ).hexdigest()
+        url = self._API_URL_TEMPLATE % (query, sig)
+        return compat_urllib_request.Request(
+            url, json.dumps(post_data).encode('utf-8')) if post_data else url
+
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+        resp = self._download_json(
+            self._prepare_call(path, timestamp, post_data), video_id, note)
+
+        error = resp.get('error')
+        if error:
+            if error == 'invalid timestamp':
+                resp = self._download_json(
+                    self._prepare_call(path, int(resp['current_timestamp']), post_data),
+                    video_id, '%s (retry)' % note)
+                error = resp.get('error')
+            if error:
+                self._raise_error(resp['error'])
+
+        return resp
+
+    def _raise_error(self, error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, error),
+            expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'login_id': username,
+            'password': password,
+        }
+
+        login = self._call_api(
+            'sessions.json', None,
+            'Logging in as %s' % username, post_data=login_form)
+
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
+
+    @staticmethod
+    def dict_selection(dict_obj, preferred_key):
+        if preferred_key in dict_obj:
+            return dict_obj.get(preferred_key)
 
-    # iPad2
-    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
+        filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
+        return filtered_dict[0] if filtered_dict else None
 
-    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+
+class VikiIE(VikiBaseIE):
+    IE_NAME = 'viki'
+    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
     _TESTS = [{
         'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
         'info_dict': {
@@ -37,111 +113,224 @@ class VikiIE(InfoExtractor):
         },
         'skip': 'Blocked in the US',
     }, {
+        # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
-        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
         'info_dict': {
             'id': '1067139v',
             'ext': 'mp4',
+            'title': "'The Avengers: Age of Ultron' Press Conference",
             'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+            'duration': 352,
+            'timestamp': 1430380829,
             'upload_date': '20150430',
-            'title': '\'The Avengers: Age of Ultron\' Press Conference',
+            'uploader': 'Arirang TV',
+            'like_count': int,
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
         'info_dict': {
             'id': '1048879v',
             'ext': 'mp4',
-            'upload_date': '20140820',
-            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
             'title': 'Ankhon Dekhi',
+            'duration': 6512,
+            'timestamp': 1408532356,
+            'upload_date': '20140820',
+            'uploader': 'Spuul',
+            'like_count': int,
+            'age_limit': 13,
         },
         'params': {
-            # requires ffmpeg
+            # m3u8 download
             'skip_download': True,
         }
+    }, {
+        # episode
+        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+        'md5': '190f3ef426005ba3a080a63325955bc3',
+        'info_dict': {
+            'id': '44699v',
+            'ext': 'mp4',
+            'title': 'Boys Over Flowers - Episode 1',
+            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+            'duration': 4155,
+            'timestamp': 1270496524,
+            'upload_date': '20100405',
+            'uploader': 'group8',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        # youtube external
+        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+        'info_dict': {
+            'id': '50562v',
+            'ext': 'mp4',
+            'title': 'Poor Nastya [COMPLETE] - Episode 1',
+            'description': '',
+            'duration': 607,
+            'timestamp': 1274949505,
+            'upload_date': '20101213',
+            'uploader': 'ad14065n',
+            'uploader_id': 'ad14065n',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        'url': 'http://www.viki.com/player/44699v',
+        'only_matching': True,
+    }, {
+        # non-English description
+        'url': 'http://www.viki.com/videos/158036v-love-in-magic',
+        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'info_dict': {
+            'id': '158036v',
+            'ext': 'mp4',
+            'uploader': 'I Planet Entertainment',
+            'upload_date': '20111122',
+            'timestamp': 1321985454,
+            'description': 'md5:44b1e46619df3a072294645c770cef36',
+            'title': 'Love In Magic',
+        },
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        uploader_m = re.search(
-            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
-        if uploader_m is None:
-            uploader = None
-        else:
-            uploader = uploader_m.group(1).strip()
-
-        rating_str = self._html_search_regex(
-            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
-            'rating information', default='').strip()
-        age_limit = US_RATINGS.get(rating_str)
-
-        req = compat_urllib_request.Request(
-            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
-        req.add_header('User-Agent', self._USER_AGENT)
-        info_webpage = self._download_webpage(
-            req, video_id, note='Downloading info page')
-        err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
-        if err_msg:
-            if 'not available in your region' in err_msg:
-                raise ExtractorError(
-                    'Video %s is blocked from your location.' % video_id,
-                    expected=True)
-            else:
-                raise ExtractorError('Viki said: ' + err_msg)
-        mobj = re.search(
-            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
-        if not mobj:
-            raise ExtractorError('Unable to find video URL')
-        video_url = unescapeHTML(mobj.group('url'))
-        video_ext = mimetype2ext(mobj.group('mime_type'))
-
-        if determine_ext(video_url) == 'm3u8':
-            formats = self._extract_m3u8_formats(
-                video_url, video_id, ext=video_ext)
-        else:
-            formats = [{
-                'url': video_url,
-                'ext': video_ext,
-            }]
-
-        upload_date_str = self._html_search_regex(
-            r'"created_at":"([^"]+)"', info_webpage, 'upload date')
-        upload_date = (
-            unified_strdate(upload_date_str)
-            if upload_date_str is not None
-            else None
-        )
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
-        return {
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+        title = self.dict_selection(video.get('titles', {}), 'en')
+        if not title:
+            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+            container_titles = video.get('container', {}).get('titles', {})
+            container_title = self.dict_selection(container_titles, 'en')
+            title = '%s - %s' % (container_title, title)
+
+        description = self.dict_selection(video.get('descriptions', {}), 'en')
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = []
+        for thumbnail_id, thumbnail in video.get('images', {}).items():
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail.get('url'),
+            })
+
+        subtitles = {}
+        for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
             'id': video_id,
             'title': title,
-            'formats': formats,
             'description': description,
-            'thumbnail': thumbnail,
-            'age_limit': age_limit,
+            'duration': duration,
+            'timestamp': timestamp,
             'uploader': uploader,
-            'subtitles': video_subtitles,
-            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
         }
 
-    def _get_subtitles(self, video_id, info_webpage):
-        res = {}
-        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
-            sturl = unescapeHTML(sturl_html)
-            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
-            if not m:
-                continue
-            res[m.group('lang')] = [{
-                'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
-                'ext': 'vtt',
-            }]
-        return res
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        if 'external' in streams:
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            for protocol, format_dict in stream_dict.items():
+                if format_id == 'm3u8':
+                    formats = self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol)
+                else:
+                    formats.append({
+                        'url': format_dict['url'],
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_count': 70,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        title = self.dict_selection(channel['titles'], 'en')
+
+        description = self.dict_selection(channel['descriptions'], 'en')
+
+        entries = []
+        for video_type in ('episodes', 'clips', 'movies'):
+            for page_num in itertools.count(1):
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_id = video['id']
+                    entries.append(self.url_result(
+                        'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+                if not page['pagination']['next']:
+                    break
+
+        return self.playlist_result(entries, channel_id, title, description)
index f300c7ca40eeba75f872bbd6cd79c3f0720e0955..10d6745af703e00d6962d3e14c8b01f2419ad955 100644 (file)
@@ -22,6 +22,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     urlencode_postdata,
+    unescapeHTML,
 )
 
 
@@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor):
         },
     ]
 
+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        # Look for embedded (iframe) Vimeo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group('url'))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return surl
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        if mobj:
+            return mobj.group(1)
+
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
@@ -436,11 +452,7 @@ class VimeoChannelIE(InfoExtractor):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
             raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', login_form))
+        fields = self._hidden_inputs(login_form)
         token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
         fields['token'] = token
         fields['password'] = password
index aa3d6ddfd2420524fd87f85819d2611225224e79..92321d66e369626c0adfeda6cb4fae282a6f7abb 100644 (file)
@@ -4,7 +4,29 @@ from .common import InfoExtractor
 from ..utils import int_or_none
 
 
-class VimpleIE(InfoExtractor):
+class SprutoBaseIE(InfoExtractor):
+    def _extract_spruto(self, spruto, video_id):
+        playlist = spruto['playlist'][0]
+        title = playlist['title']
+        video_id = playlist.get('videoId') or video_id
+        thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+        duration = int_or_none(playlist.get('duration'))
+
+        formats = [{
+            'url': f['url'],
+        } for f in playlist['video']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class VimpleIE(SprutoBaseIE):
     IE_DESC = 'Vimple - one-click video hosting'
     _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})'
     _TESTS = [
@@ -30,25 +52,9 @@ class VimpleIE(InfoExtractor):
         webpage = self._download_webpage(
             'http://player.vimple.ru/iframe/%s' % video_id, video_id)
 
-        playlist = self._parse_json(
+        spruto = self._parse_json(
             self._search_regex(
                 r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
-            video_id)['playlist'][0]
-
-        title = playlist['title']
-        video_id = playlist.get('videoId') or video_id
-        thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
-        duration = int_or_none(playlist.get('duration'))
-
-        formats = [{
-            'url': f['url'],
-        } for f in playlist['video']]
-        self._sort_formats(formats)
+            video_id)
 
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
+        return self._extract_spruto(spruto, video_id)
index 65c459fad39c36798d3d262d37ee95e52a2c313c..c733a48fa26edce6b219d3bf2404267b8c346bf6 100644 (file)
@@ -75,7 +75,7 @@ class VineIE(InfoExtractor):
         return {
             'id': video_id,
             'title': self._og_search_title(webpage),
-            'alt_title': self._og_search_description(webpage),
+            'alt_title': self._og_search_description(webpage, default=None),
             'description': data['description'],
             'thumbnail': data['thumbnailUrl'],
             'upload_date': unified_strdate(data['created']),
index cc384adbf9837f35f90c64d0e8dc0396b0b601ec..c30c5a8e524324a29d7c4a2dad49d5b9d50dda30 100644 (file)
@@ -13,14 +13,26 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     orderedSet,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
 )
 
 
 class VKIE(InfoExtractor):
-    IE_NAME = 'vk.com'
-    _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
+    IE_NAME = 'vk'
+    IE_DESC = 'VK'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+                            (?:
+                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:www\.)?biqle\.ru/watch/
+                            )
+                            (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+                        )
+                    '''
     _NETRC_MACHINE = 'vk'
 
     _TESTS = [
@@ -34,6 +46,7 @@ class VKIE(InfoExtractor):
                 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
                 'duration': 195,
                 'upload_date': '20120212',
+                'view_count': int,
             },
         },
         {
@@ -45,7 +58,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Tom Cruise',
                 'title': 'No name',
                 'duration': 9,
-                'upload_date': '20130721'
+                'upload_date': '20130721',
+                'view_count': int,
             }
         },
         {
@@ -59,6 +73,7 @@ class VKIE(InfoExtractor):
                 'title': 'Lin Dan',
                 'duration': 101,
                 'upload_date': '20120730',
+                'view_count': int,
             }
         },
         {
@@ -73,7 +88,8 @@ class VKIE(InfoExtractor):
                 'uploader': 'Триллеры',
                 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
                 'duration': 8352,
-                'upload_date': '20121218'
+                'upload_date': '20121218',
+                'view_count': int,
             },
             'skip': 'Requires vk account credentials',
         },
@@ -100,14 +116,54 @@ class VKIE(InfoExtractor):
                 'title': 'Книга Илая',
                 'duration': 6771,
                 'upload_date': '20140626',
+                'view_count': int,
             },
             'skip': 'Only works from Russia',
         },
+        {
+            # video (removed?) only available with list id
+            'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
+            'md5': '091287af5402239a1051c37ec7b92913',
+            'info_dict': {
+                'id': '171201961',
+                'ext': 'mp4',
+                'title': 'ТюменцевВВ_09.07.2015',
+                'uploader': 'Anton Ivanov',
+                'duration': 109,
+                'upload_date': '20150709',
+                'view_count': int,
+            },
+        },
+        {
+            # youtube embed
+            'url': 'https://vk.com/video276849682_170681728',
+            'info_dict': {
+                'id': 'V3K4mi0SYkc',
+                'ext': 'mp4',
+                'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
+                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+                'duration': 179,
+                'upload_date': '20130116',
+                'uploader': "Children's Joy Foundation",
+                'uploader_id': 'thecjf',
+                'view_count': int,
+            },
+        },
         {
             # removed video, just testing that we match the pattern
             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
             'only_matching': True,
         },
+        {
+            # age restricted video, requires vk account credentials
+            'url': 'https://vk.com/video205387401_164765225',
+            'only_matching': True,
+        },
+        {
+            # vk wrapper
+            'url': 'http://www.biqle.ru/watch/847655_160197695',
+            'only_matching': True,
+        }
     ]
 
     def _login(self):
@@ -115,20 +171,25 @@ class VKIE(InfoExtractor):
         if username is None:
             return
 
-        login_form = {
-            'act': 'login',
-            'role': 'al_frame',
-            'expire': '1',
-            'email': username,
-            'pass': password,
-        }
+        login_page = self._download_webpage(
+            'https://vk.com', None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
+        })
 
-        request = compat_urllib_request.Request('https://login.vk.com/?act=login',
-                                                compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+        request = compat_urllib_request.Request(
+            'https://login.vk.com/?act=login',
+            compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        login_page = self._download_webpage(
+            request, None, note='Logging in as %s' % username)
 
         if re.search(r'onLoginFailed', login_page):
-            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+            raise ExtractorError(
+                'Unable to login, incorrect username and/or password', expected=True)
 
     def _real_initialize(self):
         self._login()
@@ -140,9 +201,26 @@ class VKIE(InfoExtractor):
         if not video_id:
             video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
 
-        info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+        info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+
+        # Some videos (removed?) can only be downloaded with list id specified
+        list_id = mobj.group('list_id')
+        if list_id:
+            info_url += '&list=%s' % list_id
+
         info_page = self._download_webpage(info_url, video_id)
 
+        error_message = self._html_search_regex(
+            r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+            info_page, 'error message', default=None)
+        if error_message:
+            raise ExtractorError(error_message, expected=True)
+
+        if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+            raise ExtractorError(
+                'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+                expected=True)
+
         ERRORS = {
             r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
             'Video %s has been removed from public access due to rightholder complaint.',
@@ -156,16 +234,20 @@ class VKIE(InfoExtractor):
 
             r'<!>Видео временно недоступно':
             'Video %s is temporarily unavailable.',
+
+            r'<!>Access denied':
+            'Access denied to video %s.',
         }
 
         for error_re, error_msg in ERRORS.items():
             if re.search(error_re, info_page):
                 raise ExtractorError(error_msg % video_id, expected=True)
 
-        m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
-        if m_yt is not None:
-            self.to_screen('Youtube video detected')
-            return self.url_result(m_yt.group(1), 'Youtube')
+        youtube_url = self._search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+            info_page, 'youtube iframe', default=None)
+        if youtube_url:
+            return self.url_result(youtube_url, 'Youtube')
 
         m_rutube = re.search(
             r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
@@ -175,25 +257,29 @@ class VKIE(InfoExtractor):
                 m_rutube.group(1).replace('\\', ''))
             return self.url_result(rutube_url)
 
-        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
         if m_opts:
-            m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
             if m_opts_url:
                 opts_url = m_opts_url.group(1)
                 if opts_url.startswith('//'):
                     opts_url = 'http:' + opts_url
                 return self.url_result(opts_url)
 
-        data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+        data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
         data = json.loads(data_json)
 
         # Extract upload date
         upload_date = None
-        mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
         if mobj is not None:
             mobj.group(1) + ' ' + mobj.group(2)
             upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
 
+        view_count = str_to_int(self._search_regex(
+            r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+            info_page, 'view count', fatal=False))
+
         formats = [{
             'format_id': k,
             'url': v,
@@ -210,29 +296,39 @@ class VKIE(InfoExtractor):
             'uploader': data.get('md_author'),
             'duration': data.get('duration'),
             'upload_date': upload_date,
+            'view_count': view_count,
         }
 
 
 class VKUserVideosIE(InfoExtractor):
-    IE_NAME = 'vk.com:user-videos'
-    IE_DESC = 'vk.com:All of a user\'s videos'
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?'
+    IE_NAME = 'vk:uservideos'
+    IE_DESC = "VK - User's Videos"
+    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$'
     _TEMPLATE_URL = 'https://vk.com/videos'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://vk.com/videos205387401',
         'info_dict': {
             'id': '205387401',
+            'title': "Tom Cruise's Videos",
         },
         'playlist_mincount': 4,
-    }
+    }, {
+        'url': 'http://vk.com/videos-77521',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
-        page = self._download_webpage(url, page_id)
-        video_ids = orderedSet(
-            m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page))
-        url_entries = [
+
+        webpage = self._download_webpage(url, page_id)
+
+        entries = [
             self.url_result(
                 'http://vk.com/video' + video_id, 'VK', video_id=video_id)
-            for video_id in video_ids]
-        return self.playlist_result(url_entries, page_id)
+            for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
+
+        title = unescapeHTML(self._search_regex(
+            r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
+            webpage, 'title', default=page_id))
+
+        return self.playlist_result(entries, page_id, title)
index 1c0966a793511a2ec3a9d147bd75ff22e8fb7209..ccf1928b5d323f277b4e8a47bd4d008e821b147c 100644 (file)
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
@@ -28,12 +26,7 @@ class VodlockerIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            (?:id="[^"]+"\s+)?
-            value="([^"]*)"
-            ''', webpage))
+        fields = self._hidden_inputs(webpage)
 
         if fields['op'] == 'download1':
             self._sleep(3, video_id)  # they do detect when requests happen too fast!
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
new file mode 100644 (file)
index 0000000..254383d
--- /dev/null
@@ -0,0 +1,99 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+)
+
+
+class VoiceRepublicIE(InfoExtractor):
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+        'md5': '0554a24d1657915aa8e8f84e15dc9353',
+        'info_dict': {
+            'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
+            'ext': 'm4a',
+            'title': 'Watching the Watchers: Building a Sousveillance State',
+            'description': 'md5:715ba964958afa2398df615809cfecb1',
+            'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+            'duration': 1800,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        req = compat_urllib_request.Request(
+            compat_urlparse.urljoin(url, '/talks/%s' % display_id))
+        # Older versions of Firefox get redirected to an "upgrade browser" page
+        req.add_header('User-Agent', 'youtube-dl')
+        webpage = self._download_webpage(req, display_id)
+
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
+
+        config = self._search_regex(
+            r'(?s)return ({.+?});\s*\n', webpage,
+            'data', default=None)
+        data = self._parse_json(config, display_id, fatal=False) if config else None
+        if data:
+            title = data['title']
+            description = data.get('teaser')
+            talk_id = data.get('talk_id') or display_id
+            talk = data['talk']
+            duration = int_or_none(talk.get('duration'))
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in talk['links'].items()]
+        else:
+            title = self._og_search_title(webpage)
+            description = self._html_search_regex(
+                r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
+                webpage, 'description', fatal=False)
+            talk_id = self._search_regex(
+                [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
+                webpage, 'talk id', default=None) or display_id
+            duration = None
+            player = self._search_regex(
+                r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player')
+            formats = [{
+                'url': compat_urlparse.urljoin(url, talk_url),
+                'format_id': format_id,
+                'ext': determine_ext(talk_url) or format_id,
+                'vcodec': 'none',
+            } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)]
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        view_count = int_or_none(self._search_regex(
+            r"class='play-count[^']*'>\s*(\d+) plays",
+            webpage, 'play count', fatal=False))
+
+        return {
+            'id': talk_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
index 405cb9db49f41a144a4c842d8f99aeb1c2023da9..149e364677fcab4d0374479c4b96ff741277b17e 100644 (file)
@@ -36,6 +36,7 @@ class VubeIE(InfoExtractor):
                 'comment_count': int,
                 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
             },
+            'skip': 'Not accessible from Travis CI server',
         }, {
             'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
             'md5': 'db7aba89d4603dadd627e9d1973946fe',
index c3fde53f5ef06a56b54e94b20b72a7e98c1992a5..a6d9b5fee1f4864d82c7f8bb83e87884c96afe3b 100644 (file)
@@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):
         links_code = self._search_regex(
             r'''(?xs)
                 (?:
-                    <img\s+src="/im/play.gif".*?>|
+                    <img\s+src="[^"]*/play.gif".*?>|
                     <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
                 )
                 (.*?)
index 1eb24a3d67ffa92838ce41301b3b47d401482609..faa167e65861af3bb4803ab96fe931c15597dc00 100644 (file)
@@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):
         query_webpage = self._download_webpage(
             query_url, display_id, note='Downloading query page')
         params_json = self._search_regex(
-            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
+            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
             query_webpage,
             'player params')
         params = json.loads(params_json)
index 73077a312549f6b883fdf549a2b364f6de35db9f..2037d9b3d57cd5876d85e9552ffcc9f387fcc975 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import int_or_none
 
@@ -98,3 +100,42 @@ class WebOfStoriesIE(InfoExtractor):
             'description': description,
             'duration': duration,
         }
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.webofstories.com/playAll/donald.knuth',
+        'info_dict': {
+            'id': 'donald.knuth',
+            'title': 'Donald Knuth (Scientist)',
+        },
+        'playlist_mincount': 97,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories')
+            for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage))
+        ]
+
+        title = self._search_regex(
+            r'<div id="speakerName">\s*<span>([^<]+)</span>',
+            webpage, 'speaker', default=None)
+        if title:
+            field = self._search_regex(
+                r'<span id="primaryField">([^<]+)</span>',
+                webpage, 'field', default=None)
+            if field:
+                title += ' (%s)' % field
+
+        if not title:
+            title = self._search_regex(
+                r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
+                webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)
index d6dec25ca9e7bb9de539e89c147e22b7381e3719..f69d46a2858077ed76ec9c8fc86166668f27c705 100644 (file)
@@ -37,7 +37,8 @@ class WimpIE(InfoExtractor):
         video_id = mobj.group(1)
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
-            r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
+            [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"],
+            webpage, 'video URL')
         if YoutubeIE.suitable(video_url):
             self.to_screen('Found YouTube video')
             return {
index d5c26a032bcf28a9c8ae79e1d083d67ed29b2726..a3ea26feb38257071c8ae5d3c1702cf0fcd2650a 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 
 class WorldStarHipHopIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+    _TESTS = [{
         "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
         "md5": "9d04de741161603bf7071bbf4e883186",
         "info_dict": {
@@ -15,7 +15,15 @@ class WorldStarHipHopIE(InfoExtractor):
             "ext": "mp4",
             "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
         }
-    }
+    }, {
+        'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
+        'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
+        'info_dict': {
+            'id': 'wshh6a7q1ny0G34ZwuIO',
+            'ext': 'mp4',
+            "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -26,19 +34,22 @@ class WorldStarHipHopIE(InfoExtractor):
             return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
 
         video_url = self._search_regex(
-            r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
+            [r'so\.addVariable\("file","(.*?)"\)',
+             r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
+            webpage, 'video URL')
 
         if 'youtube' in video_url:
             return self.url_result(video_url, ie='Youtube')
 
         video_title = self._html_search_regex(
-            r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+            [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+             r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
             webpage, 'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
         thumbnail = self._html_search_regex(
             r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
-            fatal=False)
+            default=None)
         if not thumbnail:
             _title = r'candytitles.*>(.*)</span>'
             mobj = re.search(_title, webpage)
index 80c48c37d32c0849e689d626811ee34c5b414ee0..4ff99e5ca37fb8f4f0b663cc99761c31e75f1cf4 100644 (file)
@@ -1,9 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class XBefIE(InfoExtractor):
@@ -30,7 +28,7 @@ class XBefIE(InfoExtractor):
         config_url_enc = self._download_webpage(
             'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
             note='Retrieving config URL')
-        config_url = compat_urllib_parse.unquote(config_url_enc)
+        config_url = compat_urllib_parse_unquote(config_url_enc)
         config = self._download_xml(
             config_url, video_id, note='Retrieving config')
 
index 4527567f8fc26c45b091aa868dc77b159576b56c..b4ad513a0d18ecbae558deceb6234efaa5ce5415 100644 (file)
@@ -13,7 +13,6 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
     _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
     _TESTS = [
         {
@@ -133,3 +132,36 @@ class XHamsterIE(InfoExtractor):
             'age_limit': age_limit,
             'formats': formats,
         }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'upload_date': '20140728',
+            'uploader_id': 'anonymous',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            webpage, 'xhamster url')
+
+        return self.url_result(video_url, 'XHamster')
index 8c6241aedf7249343a725ab705968d0af963294a..7c9d8af6f2585207347d58d08fc607ebf4d28900 100644 (file)
@@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):
             r'minus_track\.dur_sec=\'([0-9]*?)\'',
             webpage, 'duration', fatal=False))
         filesize_approx = parse_filesize(self._html_search_regex(
-            r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+            r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
             webpage, 'approximate filesize', fatal=False))
         tbr = int_or_none(self._html_search_regex(
             r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
@@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):
             description = re.sub(' *\r *', '\n', description)
 
         enc_token = self._html_search_regex(
-            r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+            r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
         token = ''.join(
             c if pos == 3 else compat_chr(compat_ord(c) - 1)
             for pos, c in enumerate(reversed(enc_token)))
index 79ed6c744242bf132afd033ae35949cc1e2263b5..5a41f8ffa0c5a46a3d0431a6aac8e93ba8ca1cb9 100644 (file)
@@ -2,9 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class XNXXIE(InfoExtractor):
@@ -26,7 +24,7 @@ class XNXXIE(InfoExtractor):
 
         video_url = self._search_regex(r'flv_url=(.*?)&amp;',
                                        webpage, 'video URL')
-        video_url = compat_urllib_parse.unquote(video_url)
+        video_url = compat_urllib_parse_unquote(video_url)
 
         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
                                               webpage, 'title')
diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py
new file mode 100644 (file)
index 0000000..71584c2
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    xpath_with_ns,
+    xpath_text,
+    find_xpath_attr,
+)
+
+
+class XstreamIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        xstream:|
+                        https?://frontend\.xstream\.(?:dk|net)/
+                    )
+                    (?P<partner_id>[^/]+)
+                    (?:
+                        :|
+                        /feed/video/\?.*?\bid=
+                    )
+                    (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+    }, {
+        'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id')
+        video_id = mobj.group('id')
+
+        data = self._download_xml(
+            'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'
+            % (partner_id, video_id),
+            video_id)
+
+        NS_MAP = {
+            'atom': 'http://www.w3.org/2005/Atom',
+            'xt': 'http://xstream.dk/',
+            'media': 'http://search.yahoo.com/mrss/',
+        }
+
+        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+        title = xpath_text(
+            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+        description = xpath_text(
+            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+        timestamp = parse_iso8601(xpath_text(
+            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+        formats = []
+        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+            media_url = media_content.get('url')
+            if not media_url:
+                continue
+            tbr = int_or_none(media_content.get('bitrate'))
+            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+            if mobj:
+                formats.append({
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:%s' % mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'ext': 'flv',
+                    'tbr': tbr,
+                    'format_id': 'rtmp-%d' % tbr,
+                })
+            else:
+                formats.append({
+                    'url': media_url,
+                    'tbr': tbr,
+                })
+        self._sort_formats(formats)
+
+        link = find_xpath_attr(
+            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+        if link is not None:
+            formats.append({
+                'url': link.get('href'),
+                'format_id': link.get('rel'),
+            })
+
+        thumbnails = [{
+            'url': splash.get('url'),
+            'width': int_or_none(splash.get('width')),
+            'height': int_or_none(splash.get('height')),
+        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
index 1644f53c876329f053406be3d3dc1aa463cddc1b..779e4f46a1dd5315c6a9be3dad09e65c07a205b2 100644 (file)
@@ -5,7 +5,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
 )
 from ..utils import (
     parse_duration,
@@ -59,7 +59,7 @@ class XTubeIE(InfoExtractor):
         for format_id, video_url in re.findall(
                 r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
             fmt = {
-                'url': compat_urllib_parse.unquote(video_url),
+                'url': compat_urllib_parse_unquote(video_url),
                 'format_id': format_id,
             }
             m = re.search(r'^(?P<height>\d+)[pP]', format_id)
@@ -68,7 +68,7 @@ class XTubeIE(InfoExtractor):
             formats.append(fmt)
 
         if not formats:
-            video_url = compat_urllib_parse.unquote(self._search_regex(
+            video_url = compat_urllib_parse_unquote(self._search_regex(
                 r'flashvars\.video_url\s*=\s*"([^"]+)"',
                 webpage, 'video URL'))
             formats.append({'url': video_url})
index 81d885fdcee1cf788c217e862629df58f386d73c..5aac8adb36e2ad12e798cb4f0c77e5b204c7b91b 100644 (file)
@@ -13,6 +13,7 @@ from ..utils import (
 
 
 class XuiteIE(InfoExtractor):
+    IE_DESC = '隨意窩Xuite影音'
     _REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
     _VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
     _TESTS = [{
index 2a45dc574263f7e651020e591fcc40bdf987367d..5dcf2fdd12f9140f0bd373fd5db41c93f4b18b38 100644 (file)
@@ -4,11 +4,13 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_request,
 )
 from ..utils import (
     clean_html,
     ExtractorError,
+    determine_ext,
 )
 
 
@@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor):
         }
     }
 
+    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -33,16 +37,37 @@ class XVideosIE(InfoExtractor):
         if mobj:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
 
-        video_url = compat_urllib_parse.unquote(
+        video_url = compat_urllib_parse_unquote(
             self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
         video_title = self._html_search_regex(
             r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
 
+        formats = [{
+            'url': video_url,
+        }]
+
+        android_req = compat_urllib_request.Request(url)
+        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+        if android_webpage is not None:
+            player_params_str = self._search_regex(
+                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+                android_webpage, 'player parameters', default='')
+            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+            if player_params:
+                formats.extend([{
+                    'url': param,
+                    'preference': -10,
+                } for param in player_params if determine_ext(param) == 'mp4'])
+
+        self._sort_formats(formats)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
index bf4e659ac6981c77f7e5f3c77578c4808634d766..f9afbdbab611e233c7f7014ae7d66e996f2b7c31 100644 (file)
@@ -15,6 +15,7 @@ from ..utils import (
     unescapeHTML,
     ExtractorError,
     int_or_none,
+    mimetype2ext,
 )
 
 from .nbc import NBCSportsVPlayerIE
@@ -236,6 +237,22 @@ class YahooIE(InfoExtractor):
 
         self._sort_formats(formats)
 
+        closed_captions = self._html_search_regex(
+            r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+            default='[]')
+
+        cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+        subtitles = {}
+        if cc_json:
+            for closed_caption in cc_json:
+                lang = closed_caption['lang']
+                if lang not in subtitles:
+                    subtitles[lang] = []
+                subtitles[lang].append({
+                    'url': closed_caption['url'],
+                    'ext': mimetype2ext(closed_caption['content_type']),
+                })
+
         return {
             'id': video_id,
             'display_id': display_id,
@@ -244,6 +261,7 @@ class YahooIE(InfoExtractor):
             'description': clean_html(meta['description']),
             'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
             'duration': int_or_none(meta.get('duration')),
+            'subtitles': subtitles,
         }
 
 
index 19f8762ae57e814a11f868be1a008e21bbaf8efc..001ee17b6f93d457bdc2fbdaf802b61ef19e1b41 100644 (file)
@@ -9,10 +9,12 @@ from ..utils import (
     float_or_none,
     month_by_abbreviation,
     ExtractorError,
+    get_element_by_attribute,
 )
 
 
 class YamIE(InfoExtractor):
+    IE_DESC = '蕃薯藤yam天空部落'
     _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
 
     _TESTS = [{
@@ -23,6 +25,7 @@ class YamIE(InfoExtractor):
             'id': '2283921',
             'ext': 'mp3',
             'title': '發現 - 趙薇 京華煙雲主題曲',
+            'description': '發現 - 趙薇 京華煙雲主題曲',
             'uploader_id': 'princekt',
             'upload_date': '20080807',
             'duration': 313.0,
@@ -55,6 +58,17 @@ class YamIE(InfoExtractor):
             'ext': 'mp4',
         },
         'skip': 'invalid YouTube URL',
+    }, {
+        'url': 'http://mymedia.yam.com/m/2373534',
+        'md5': '7ff74b91b7a817269d83796f8c5890b1',
+        'info_dict': {
+            'id': '2373534',
+            'ext': 'mp3',
+            'title': '林俊傑&蔡卓妍-小酒窩',
+            'description': 'md5:904003395a0fcce6cfb25028ff468420',
+            'upload_date': '20080928',
+            'uploader_id': 'onliner2',
+        }
     }]
 
     def _real_extract(self, url):
@@ -75,15 +89,19 @@ class YamIE(InfoExtractor):
         if youtube_url:
             return self.url_result(youtube_url, 'Youtube')
 
+        title = self._html_search_regex(
+            r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
+
         api_page = self._download_webpage(
             'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
             note='Downloading API page')
         api_result_obj = compat_urlparse.parse_qs(api_page)
 
+        info_table = get_element_by_attribute('class', 'info', page)
         uploader_id = self._html_search_regex(
-            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z]+)"',
-            page, 'uploader id', fatal=False)
-        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})  ' +
+            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
+            info_table, 'uploader id', fatal=False)
+        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
                          r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
         if mobj:
             upload_date = '%s%02d%02d' % (
@@ -97,7 +115,8 @@ class YamIE(InfoExtractor):
         return {
             'id': video_id,
             'url': api_result_obj['mp3file'][0],
-            'title': self._html_search_meta('description', page),
+            'title': title,
+            'description': self._html_search_meta('description', page),
             'duration': duration,
             'uploader_id': uploader_id,
             'upload_date': upload_date,
diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py
new file mode 100644 (file)
index 0000000..834d860
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class YinYueTaiIE(InfoExtractor):
+    IE_NAME = 'yinyuetai:video'
+    IE_DESC = '音悦Tai'
+    _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://v.yinyuetai.com/video/2322376',
+        'md5': '6e3abe28d38e3a54b591f9f040595ce0',
+        'info_dict': {
+            'id': '2322376',
+            'ext': 'mp4',
+            'title': '少女时代_PARTY_Music Video Teaser',
+            'creator': '少女时代',
+            'duration': 25,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://v.yinyuetai.com/video/h5/2322376',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
+            'Downloading mv info')['videoInfo']['coreVideoInfo']
+
+        if info['error']:
+            raise ExtractorError(info['errorMsg'], expected=True)
+
+        formats = [{
+            'url': format_info['videoUrl'],
+            'format_id': format_info['qualityLevel'],
+            'format': format_info.get('qualityLevelName'),
+            'filesize': format_info.get('fileSize'),
+            # though URLs ends with .flv, the downloaded files are in fact mp4
+            'ext': 'mp4',
+            'tbr': format_info.get('bitrate'),
+        } for format_info in info['videoUrlModels']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['videoName'],
+            'thumbnail': info.get('bigHeadImage'),
+            'creator': info.get('artistNames'),
+            'duration': info.get('duration'),
+            'formats': formats,
+        }
index 894678a23dac9d1b03e07f0cd9b2eecc7e690e18..869f3e8190ca0b751366a85f142a0b49fe294fa1 100644 (file)
@@ -5,7 +5,7 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
 
 
 class YnetIE(InfoExtractor):
@@ -34,7 +34,7 @@ class YnetIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
+        content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
         config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
         f4m_url = config['clip']['url']
         title = self._og_search_title(webpage)
index 97b98bbe88715f644da6bec1709d697af2c8e0e0..78caeb8b36e0be8cf4e97365d9e28251723059b7 100644 (file)
 # coding: utf-8
-
 from __future__ import unicode_literals
 
-import math
-import random
-import re
-import time
+import base64
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
+from ..utils import ExtractorError
+
+from ..compat import (
+    compat_urllib_parse,
+    compat_ord,
+    compat_urllib_request,
 )
 
 
 class YoukuIE(InfoExtractor):
+    IE_NAME = 'youku'
+    IE_DESC = '优酷'
     _VALID_URL = r'''(?x)
         (?:
             http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
     '''
-    _TEST = {
-        'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
-        'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
-        'params': {
-            'test': False
+
+    _TESTS = [{
+        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+        'md5': '5f3af4192eabacc4501508d54a8cabd7',
+        'info_dict': {
+            'id': 'XMTc1ODE5Njcy_part1',
+            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+            'ext': 'flv'
+        }
+    }, {
+        'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+        'only_matching': True,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+        'info_dict': {
+            'id': 'XODgxNjg1Mzk2',
+            'title': '武媚娘传奇 85',
         },
+        'playlist_count': 11,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
         'info_dict': {
-            'id': 'XNDgyMDQ2NTQw_part00',
-            'ext': 'flv',
-            'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+            'id': 'XMTI1OTczNDM5Mg',
+            'title': '花千骨 04',
+        },
+        'playlist_count': 13,
+        'skip': 'Available in China only',
+    }]
+
+    def construct_video_urls(self, data1, data2):
+        # get sid, token
+        def yk_t(s1, s2):
+            ls = list(range(256))
+            t = 0
+            for i in range(256):
+                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+                ls[i], ls[t] = ls[t], ls[i]
+            s = bytearray()
+            x, y = 0, 0
+            for i in range(len(s2)):
+                y = (y + 1) % 256
+                x = (x + ls[y]) % 256
+                ls[x], ls[y] = ls[y], ls[x]
+                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+            return bytes(s)
+
+        sid, token = yk_t(
+            b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+        ).decode('ascii').split('_')
+
+        # get oip
+        oip = data2['ip']
+
+        # get fileid
+        string_ls = list(
+            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+        shuffled_string_ls = []
+        seed = data1['seed']
+        N = len(string_ls)
+        for ii in range(N):
+            seed = (seed * 0xd3 + 0x754f) % 0x10000
+            idx = seed * len(string_ls) // 0x10000
+            shuffled_string_ls.append(string_ls[idx])
+            del string_ls[idx]
+
+        fileid_dict = {}
+        for format in data1['streamtypes']:
+            streamfileid = [
+                int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+            fileid = ''.join(
+                [shuffled_string_ls[i] for i in streamfileid])
+            fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+        def get_fileid(format, n):
+            fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+            return fileid
+
+        # get ep
+        def generate_ep(format, n):
+            fileid = get_fileid(format, n)
+            ep_t = yk_t(
+                b'bf7e5f01',
+                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+            )
+            ep = base64.b64encode(ep_t).decode('ascii')
+            return ep
+
+        # generate video_urls
+        video_urls_dict = {}
+        for format in data1['streamtypes']:
+            video_urls = []
+            for dt in data1['segs'][format]:
+                n = str(int(dt['no']))
+                param = {
+                    'K': dt['k'],
+                    'hd': self.get_hd(format),
+                    'myp': 0,
+                    'ts': dt['seconds'],
+                    'ypp': 0,
+                    'ctype': 12,
+                    'ev': 1,
+                    'token': token,
+                    'oip': oip,
+                    'ep': generate_ep(format, n)
+                }
+                video_url = \
+                    'http://k.youku.com/player/getFlvPath/' + \
+                    'sid/' + sid + \
+                    '_' + str(int(n) + 1).zfill(2) + \
+                    '/st/' + self.parse_ext_l(format) + \
+                    '/fileid/' + get_fileid(format, n) + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                video_urls.append(video_url)
+            video_urls_dict[format] = video_urls
+
+        return video_urls_dict
+
+    def get_hd(self, fm):
+        hd_id_dict = {
+            'flv': '0',
+            'mp4': '1',
+            'hd2': '2',
+            'hd3': '3',
+            '3gp': '0',
+            '3gphd': '1'
         }
-    }
-
-    def _gen_sid(self):
-        nowTime = int(time.time() * 1000)
-        random1 = random.randint(1000, 1998)
-        random2 = random.randint(1000, 9999)
-
-        return "%d%d%d" % (nowTime, random1, random2)
-
-    def _get_file_ID_mix_string(self, seed):
-        mixed = []
-        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
-        seed = float(seed)
-        for i in range(len(source)):
-            seed = (seed * 211 + 30031) % 65536
-            index = math.floor(seed / 65536 * len(source))
-            mixed.append(source[int(index)])
-            source.remove(source[int(index)])
-        # return ''.join(mixed)
-        return mixed
-
-    def _get_file_id(self, fileId, seed):
-        mixed = self._get_file_ID_mix_string(seed)
-        ids = fileId.split('*')
-        realId = []
-        for ch in ids:
-            if ch:
-                realId.append(mixed[int(ch)])
-        return ''.join(realId)
+        return hd_id_dict[fm]
+
+    def parse_ext_l(self, fm):
+        ext_dict = {
+            'flv': 'flv',
+            'mp4': 'mp4',
+            'hd2': 'flv',
+            'hd3': 'flv',
+            '3gp': 'flv',
+            '3gphd': 'mp4'
+        }
+        return ext_dict[fm]
+
+    def get_format_name(self, fm):
+        _dict = {
+            '3gp': 'h6',
+            '3gphd': 'h5',
+            'flv': 'h4',
+            'mp4': 'h3',
+            'hd2': 'h2',
+            'hd3': 'h1'
+        }
+        return _dict[fm]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+        video_id = self._match_id(url)
 
-        config = self._download_json(info_url, video_id)
+        def retrieve_data(req_url, note):
+            req = compat_urllib_request.Request(req_url)
 
-        error_code = config['data'][0].get('error_code')
-        if error_code:
-            # -8 means blocked outside China.
-            error = config['data'][0].get('error')  # Chinese and English, separated by newline.
-            raise ExtractorError(error or 'Server reported error %i' % error_code,
-                                 expected=True)
+            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+            if cn_verification_proxy:
+                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
 
-        video_title = config['data'][0]['title']
-        seed = config['data'][0]['seed']
+            raw_data = self._download_json(req, video_id, note=note)
+            return raw_data['data'][0]
 
-        format = self._downloader.params.get('format', None)
-        supported_format = list(config['data'][0]['streamfileids'].keys())
+        # request basic data
+        data1 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
+            'Downloading JSON metadata 1')
+        data2 = retrieve_data(
+            'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+            'Downloading JSON metadata 2')
 
-        # TODO proper format selection
-        if format is None or format == 'best':
-            if 'hd2' in supported_format:
-                format = 'hd2'
+        error_code = data1.get('error_code')
+        if error_code:
+            error = data1.get('error')
+            if error is not None and '因版权原因无法观看此视频' in error:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is available in China only', expected=True)
             else:
-                format = 'flv'
-            ext = 'flv'
-        elif format == 'worst':
-            format = 'mp4'
-            ext = 'mp4'
-        else:
-            format = 'flv'
-            ext = 'flv'
-
-        fileid = config['data'][0]['streamfileids'][format]
-        keys = [s['k'] for s in config['data'][0]['segs'][format]]
-        # segs is usually a dictionary, but an empty *list* if an error occured.
-
-        files_info = []
-        sid = self._gen_sid()
-        fileid = self._get_file_id(fileid, seed)
-
-        # column 8,9 of fileid represent the segment number
-        # fileid[7:9] should be changed
-        for index, key in enumerate(keys):
-            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
-            download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
-            info = {
-                'id': '%s_part%02d' % (video_id, index),
-                'url': download_url,
-                'uploader': None,
-                'upload_date': None,
-                'title': video_title,
-                'ext': ext,
-            }
-            files_info.append(info)
-
-        return files_info
+                msg = 'Youku server reported error %i' % error_code
+                if error is not None:
+                    msg += ': ' + error
+                raise ExtractorError(msg)
+
+        title = data1['title']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(data1, data2)
+
+        # construct info
+        entries = [{
+            'id': '%s_part%d' % (video_id, i + 1),
+            'title': title,
+            'formats': [],
+            # some formats are not available for all parts, we have to detect
+            # which one has all
+        } for i in range(max(len(v) for v in data1['segs'].values()))]
+        for fm in data1['streamtypes']:
+            video_urls = video_urls_dict[fm]
+            for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+                entry['formats'].append({
+                    'url': video_url,
+                    'format_id': self.get_format_name(fm),
+                    'ext': self.parse_ext_l(fm),
+                    'filesize': int(seg['size']),
+                })
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'title': title,
+            'entries': entries,
+        }
index 0869c9fd468286e9c4d7125a76b41f2a51b7d29a..4023a6e50bafd90bea320ef70ae4138961251d5a 100644 (file)
@@ -17,6 +17,9 @@ from ..compat import (
     compat_chr,
     compat_parse_qs,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
+    compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
     compat_str,
@@ -29,9 +32,12 @@ from ..utils import (
     get_element_by_id,
     int_or_none,
     orderedSet,
+    parse_duration,
+    str_to_int,
     unescapeHTML,
     unified_strdate,
     uppercase_escape,
+    ISO3166Utils,
 )
 
 
@@ -49,6 +55,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             # YouTube sets the expire time to about two months
             expire_time=time.time() + 2 * 30 * 24 * 3600)
 
+    def _ids_to_results(self, ids):
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
     def _login(self):
         """
         Attempt to log in to YouTube.
@@ -229,6 +240,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '44': {'ext': 'webm', 'width': 854, 'height': 480},
         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480},
 
 
         # 3d videos
@@ -306,7 +319,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_NAME = 'youtube'
     _TESTS = [
         {
-            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
+            'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
             'info_dict': {
                 'id': 'BaW_jenozKc',
                 'ext': 'mp4',
@@ -318,6 +331,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'categories': ['Science & Technology'],
                 'like_count': int,
                 'dislike_count': int,
+                'start_time': 1,
+                'end_time': 9,
             }
         },
         {
@@ -511,6 +526,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': 'requires avconv',
             }
         },
+        # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+        {
+            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+            'info_dict': {
+                'id': 'FIl7x6_3R5Y',
+                'ext': 'mp4',
+                'title': 'md5:7b81415841e02ecd4313668cde88737a',
+                'description': 'md5:116377fd2963b81ec4ce64b542173306',
+                'upload_date': '20150625',
+                'uploader_id': 'dorappi2000',
+                'uploader': 'dorappi2000',
+                'formats': 'mincount:33',
+            },
+        },
+        # DASH manifest with segment_list
+        {
+            'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+            'md5': '8ce563a1d667b599d21064e982ab9e31',
+            'info_dict': {
+                'id': 'CsmdDsKjzN8',
+                'ext': 'mp4',
+                'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+                'uploader': 'Airtek',
+                'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+                'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+                'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '135',  # bestvideo
+            }
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -775,16 +822,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 
     def _parse_dash_manifest(
-            self, video_id, dash_manifest_url, player_url, age_gate):
+            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
         def decrypt_sig(mobj):
             s = mobj.group(1)
             dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
             return '/signature/%s' % dec_s
-        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
         dash_doc = self._download_xml(
             dash_manifest_url, video_id,
             note='Downloading DASH manifest',
-            errnote='Could not download DASH manifest')
+            errnote='Could not download DASH manifest',
+            fatal=fatal)
+
+        if dash_doc is False:
+            return []
 
         formats = []
         for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
@@ -797,6 +848,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     # TODO implement WebVTT downloading
                     pass
                 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                     format_id = r.attrib['id']
                     video_url = url_el.text
                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
@@ -810,6 +862,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         'filesize': filesize,
                         'fps': int_or_none(r.attrib.get('frameRate')),
                     }
+                    if segment_list is not None:
+                        f.update({
+                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
+                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
+                            'protocol': 'http_dash_segments',
+                        })
                     try:
                         existing_format = next(
                             fo for fo in formats
@@ -817,6 +875,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     except StopIteration:
                         full_info = self._formats.get(format_id, {}).copy()
                         full_info.update(f)
+                        codecs = r.attrib.get('codecs')
+                        if codecs:
+                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+                                full_info['vcodec'] = codecs
+                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+                                full_info['acodec'] = codecs
                         formats.append(full_info)
                     else:
                         existing_format.update(f)
@@ -829,10 +893,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'http' if self._downloader.params.get('prefer_insecure', False)
             else 'https')
 
+        start_time = None
+        end_time = None
+        parsed_url = compat_urllib_parse_urlparse(url)
+        for component in [parsed_url.fragment, parsed_url.query]:
+            query = compat_parse_qs(component)
+            if start_time is None and 't' in query:
+                start_time = parse_duration(query['t'][0])
+            if start_time is None and 'start' in query:
+                start_time = parse_duration(query['start'][0])
+            if end_time is None and 'end' in query:
+                end_time = parse_duration(query['end'][0])
+
         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
         mobj = re.search(self._NEXT_URL_RE, url)
         if mobj:
-            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+            url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
         video_id = self.extract_id(url)
 
         # Get video webpage
@@ -846,8 +922,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             player_url = None
 
+        dash_mpds = []
+
+        def add_dash_mpd(video_info):
+            dash_mpd = video_info.get('dashmpd')
+            if dash_mpd and dash_mpd[0] not in dash_mpds:
+                dash_mpds.append(dash_mpd[0])
+
         # Get video info
         embed_webpage = None
+        is_live = None
         if re.search(r'player-age-gate-content">', video_webpage) is not None:
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -866,24 +950,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 note='Refetching age-gated info webpage',
                 errnote='unable to download video info webpage')
             video_info = compat_parse_qs(video_info_webpage)
+            add_dash_mpd(video_info)
         else:
             age_gate = False
-            try:
-                # Try looking directly into the video webpage
-                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
-                if not mobj:
-                    raise ValueError('Could not find ytplayer.config')  # caught below
+            video_info = None
+            # Try looking directly into the video webpage
+            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+            if mobj:
                 json_code = uppercase_escape(mobj.group(1))
                 ytplayer_config = json.loads(json_code)
                 args = ytplayer_config['args']
-                # Convert to the same format returned by compat_parse_qs
-                video_info = dict((k, [v]) for k, v in args.items())
-                if not args.get('url_encoded_fmt_stream_map'):
-                    raise ValueError('No stream_map present')  # caught below
-            except ValueError:
-                # We fallback to the get_video_info pages (used by the embed page)
+                if args.get('url_encoded_fmt_stream_map'):
+                    # Convert to the same format returned by compat_parse_qs
+                    video_info = dict((k, [v]) for k, v in args.items())
+                    add_dash_mpd(video_info)
+                if args.get('livestream') == '1' or args.get('live_playback') == 1:
+                    is_live = True
+            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+                # We also try looking in get_video_info since it may contain different dashmpd
+                # URL that points to a DASH manifest with possibly different itag set (some itags
+                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+                # manifest pointed by get_video_info's dashmpd).
+                # The general idea is to take a union of itags of both DASH manifests (for example
+                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                 self.report_video_info_webpage_download(video_id)
-                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                     video_info_url = (
                         '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                         % (proto, video_id, el_type))
@@ -891,11 +982,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         video_info_url,
                         video_id, note=False,
                         errnote='unable to download video info webpage')
-                    video_info = compat_parse_qs(video_info_webpage)
-                    if 'token' in video_info:
+                    get_video_info = compat_parse_qs(video_info_webpage)
+                    if get_video_info.get('use_cipher_signature') != ['True']:
+                        add_dash_mpd(get_video_info)
+                    if not video_info:
+                        video_info = get_video_info
+                    if 'token' in get_video_info:
                         break
         if 'token' not in video_info:
             if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+                    if regions_allowed:
+                        raise ExtractorError('YouTube said: This video is available in %s only' % (
+                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+                            expected=True)
                 raise ExtractorError(
                     'YouTube said: %s' % video_info['reason'][0],
                     expected=True, video_id=video_id)
@@ -919,7 +1020,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # uploader
         if 'author' not in video_info:
             raise ExtractorError('Unable to extract uploader name')
-        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+        video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
 
         # uploader_id
         video_uploader_id = None
@@ -946,18 +1047,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader.report_warning('unable to extract video thumbnail')
             video_thumbnail = None
         else:   # don't panic if we can't find it
-            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
 
         # upload date
-        upload_date = None
-        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
-        if mobj is None:
-            mobj = re.search(
-                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
-                video_webpage)
-        if mobj is not None:
-            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
-            upload_date = unified_strdate(upload_date)
+        upload_date = self._html_search_meta(
+            'datePublished', video_webpage, 'upload date', default=None)
+        if not upload_date:
+            upload_date = self._search_regex(
+                [r'(?s)id="eow-date.*?>(.*?)</span>',
+                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+                video_webpage, 'upload date', default=None)
+            if upload_date:
+                upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
+        upload_date = unified_strdate(upload_date)
 
         m_cat_container = self._search_regex(
             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -991,12 +1093,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 video_description = ''
 
         def _extract_count(count_name):
-            count = self._search_regex(
-                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
-                video_webpage, count_name, default=None)
-            if count is not None:
-                return int(count.replace(',', ''))
-            return None
+            return str_to_int(self._search_regex(
+                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+                % re.escape(count_name),
+                video_webpage, count_name, default=None))
+
         like_count = _extract_count('like')
         dislike_count = _extract_count('dislike')
 
@@ -1008,7 +1109,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader.report_warning('unable to extract video duration')
             video_duration = None
         else:
-            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
+            video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
 
         # annotations
         video_annotations = None
@@ -1111,23 +1212,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         # Look for the DASH manifest
         if self._downloader.params.get('youtube_include_dash_manifest', True):
-            dash_mpd = video_info.get('dashmpd')
-            if dash_mpd:
-                dash_manifest_url = dash_mpd[0]
+            dash_mpd_fatal = True
+            for dash_manifest_url in dash_mpds:
+                dash_formats = {}
                 try:
-                    dash_formats = self._parse_dash_manifest(
-                        video_id, dash_manifest_url, player_url, age_gate)
+                    for df in self._parse_dash_manifest(
+                            video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
+                        # Do not overwrite DASH format found in some previous DASH manifest
+                        if df['format_id'] not in dash_formats:
+                            dash_formats[df['format_id']] = df
+                        # Additional DASH manifests may end up in HTTP Error 403 therefore
+                        # allow them to fail without bug report message if we already have
+                        # some DASH manifest succeeded. This is temporary workaround to reduce
+                        # burst of bug reports until we figure out the reason and whether it
+                        # can be fixed at all.
+                        dash_mpd_fatal = False
                 except (ExtractorError, KeyError) as e:
                     self.report_warning(
                         'Skipping DASH manifest: %r' % e, video_id)
-                else:
-                    # Hide the formats we found through non-DASH
-                    dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
-                    formats.extend(dash_formats)
+                if dash_formats:
+                    # Remove the formats we found through non-DASH, they
+                    # contain less info and it can be wrong, because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
+                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+                    formats.extend(dash_formats.values())
 
         # Check for malformed aspect ratio
         stretched_m = re.search(
@@ -1161,6 +1271,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'dislike_count': dislike_count,
             'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
             'formats': formats,
+            'is_live': is_live,
+            'start_time': start_time,
+            'end_time': end_time,
         }
 
 
@@ -1261,11 +1374,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _ids_to_results(self, ids):
-        return [
-            self.url_result(vid_id, 'Youtube', video_id=vid_id)
-            for vid_id in ids]
-
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
         # the id of the playlist is just 'RD' + video_id
@@ -1289,7 +1397,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
-        more_widget_html = content_html = page
 
         for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
             match = match.strip()
@@ -1309,36 +1416,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                 self.report_warning('Youtube gives an alert message: ' + match)
 
         # Extract the video ids from the playlist pages
-        ids = []
-
-        for page_num in itertools.count(1):
-            matches = re.finditer(self._VIDEO_RE, content_html)
-            # We remove the duplicates and the link with index 0
-            # (it's not the first video of the playlist)
-            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-            ids.extend(new_ids)
-
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+        def _entries():
+            more_widget_html = content_html = page
+            for page_num in itertools.count(1):
+                matches = re.finditer(self._VIDEO_RE, content_html)
+                # We remove the duplicates and the link with index 0
+                # (it's not the first video of the playlist)
+                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+                for vid_id in new_ids:
+                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+                if not mobj:
+                    break
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+                more = self._download_json(
+                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'Downloading page #%s' % page_num,
+                    transform_source=uppercase_escape)
+                content_html = more['content_html']
+                if not content_html.strip():
+                    # Some webpages show a "Load more" button but they don't
+                    # have more videos
+                    break
+                more_widget_html = more['load_more_widget_html']
 
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        url_results = self._ids_to_results(ids)
-        return self.playlist_result(url_results, playlist_id, playlist_title)
+        return self.playlist_result(_entries(), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
@@ -1398,6 +1505,24 @@ class YoutubeChannelIE(InfoExtractor):
         channel_id = self._match_id(url)
 
         url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+        # otherwise fallback on channel by page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._html_search_meta(
+            'channelId', channel_page, 'channel id', default=None)
+        if not channel_playlist_id:
+            channel_playlist_id = self._search_regex(
+                r'data-channel-external-id="([^"]+)"',
+                channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
         channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
@@ -1486,7 +1611,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
 
         for pagenum in itertools.count(1):
             url_query = {
-                'search_query': query,
+                'search_query': query.encode('utf-8'),
                 'page': pagenum,
                 'spf': 'navigate',
             }
@@ -1534,7 +1659,7 @@ class YoutubeSearchURLIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
 
         webpage = self._download_webpage(url, query)
         result_code = self._search_regex(
@@ -1601,20 +1726,10 @@ class YoutubeShowIE(InfoExtractor):
 
 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
     """
-    Base class for extractors that fetch info from
-    http://www.youtube.com/feed_ajax
+    Base class for feed extractors
     Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
     """
     _LOGIN_REQUIRED = True
-    # use action_load_personal_feed instead of action_load_system_feed
-    _PERSONAL_FEED = False
-
-    @property
-    def _FEED_TEMPLATE(self):
-        action = 'action_load_system_feed'
-        if self._PERSONAL_FEED:
-            action = 'action_load_personal_feed'
-        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
 
     @property
     def IE_NAME(self):
@@ -1624,36 +1739,38 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
         self._login()
 
     def _real_extract(self, url):
-        feed_entries = []
-        paging = 0
-        for i in itertools.count(1):
-            info = self._download_json(
-                self._FEED_TEMPLATE % paging,
-                '%s feed' % self._FEED_NAME,
-                'Downloading page %s' % i,
-                transform_source=uppercase_escape)
-            feed_html = info.get('feed_html') or info.get('content_html')
-            load_more_widget_html = info.get('load_more_widget_html') or feed_html
-            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
-            ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in ids)
-            mobj = re.search(
-                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                load_more_widget_html)
-            if mobj is None:
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
+
+        # The extraction process is the same as for playlists, but the regex
+        # for the video ids doesn't contain an index
+        ids = []
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
                 break
-            paging = mobj.group('paging')
-        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
 
+            ids.extend(new_ids)
 
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:recommended'
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
-    _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+
+        return self.playlist_result(
+            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
 
 
 class YoutubeWatchLaterIE(YoutubePlaylistIE):
@@ -1667,15 +1784,6 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE):
         return self._extract_playlist('WL')
 
 
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:history'
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
-    _FEED_NAME = 'history'
-    _PERSONAL_FEED = True
-    _PLAYLIST_TITLE = 'Youtube Watch History'
-
-
 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
     IE_NAME = 'youtube:favorites'
     IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
@@ -1688,42 +1796,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
         return self.url_result(playlist_id, 'YoutubePlaylist')
 
 
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:subscriptions'
-    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube Subscriptions'
-        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
-            ids.extend(new_ids)
 
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
 
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
 
 
 class YoutubeTruncatedURLIE(InfoExtractor):
index 1afbe68ed68e084028cda0c0f9d7d80a385118fb..7dc1e2f2bd3f36e2b71e199fb5e5f6f2cc4e18e9 100644 (file)
@@ -4,12 +4,18 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import ExtractorError
 
 
 class ZingMp3BaseInfoExtractor(InfoExtractor):
 
-    @staticmethod
-    def _extract_item(item):
+    def _extract_item(self, item):
+        error_message = item.find('./errormessage').text
+        if error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
         title = item.find('./title').text.strip()
         source = item.find('./source').text
         extension = item.attrib['type']
index 22dbc3aec7866ad3f5d048c35737486a8fdac8fc..9016e34983d3fed5e0fab72e9a8626124cdee859 100644 (file)
@@ -145,11 +145,15 @@ def parseOpts(overrideArguments=None):
     general.add_option(
         '--list-extractors',
         action='store_true', dest='list_extractors', default=False,
-        help='List all supported extractors and the URLs they would handle')
+        help='List all supported extractors')
     general.add_option(
         '--extractor-descriptions',
         action='store_true', dest='list_extractor_descriptions', default=False,
         help='Output descriptions of all supported extractors')
+    general.add_option(
+        '--force-generic-extractor',
+        action='store_true', dest='force_generic_extractor', default=False,
+        help='Force extraction to use the generic extractor')
     general.add_option(
         '--default-search',
         dest='default_search', metavar='PREFIX',
@@ -215,7 +219,7 @@ def parseOpts(overrideArguments=None):
     selection.add_option(
         '--playlist-items',
         dest='playlist_items', metavar='ITEM_SPEC', default=None,
-        help='Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
+        help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.')
     selection.add_option(
         '--match-title',
         dest='matchtitle', metavar='REGEX',
@@ -342,12 +346,13 @@ def parseOpts(overrideArguments=None):
     video_format.add_option(
         '--youtube-skip-dash-manifest',
         action='store_false', dest='youtube_include_dash_manifest',
-        help='Do not download the DASH manifest on YouTube videos')
+        help='Do not download the DASH manifests and related data on YouTube videos')
     video_format.add_option(
         '--merge-output-format',
         action='store', dest='merge_output_format', metavar='FORMAT', default=None,
         help=(
-            'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
+            'If a merge is required (e.g. bestvideo+bestaudio), '
+            'output to given container format. One of mkv, mp4, ogg, webm, flv. '
             'Ignored if no merge is required'))
 
     subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
@@ -537,7 +542,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '--dump-pages', '--dump-intermediate-pages',
         action='store_true', dest='dump_intermediate_pages', default=False,
-        help='Print downloaded pages to debug problems (very verbose)')
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
     verbosity.add_option(
         '--write-pages',
         action='store_true', dest='write_pages', default=False,
@@ -686,7 +691,11 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '--recode-video',
         metavar='FORMAT', dest='recodevideo', default=None,
-        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)')
+        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)')
+    postproc.add_option(
+        '--postprocessor-args',
+        dest='postprocessor_args', metavar='ARGS',
+        help='Give these arguments to the postprocessor')
     postproc.add_option(
         '-k', '--keep-video',
         action='store_true', dest='keepvideo', default=False,
@@ -713,7 +722,7 @@ def parseOpts(overrideArguments=None):
         help='Parse additional metadata like song title / artist from the video title. '
              'The format syntax is the same as --output, '
              'the parsed parameters replace existing values. '
-             'Additional templates: %(album), %(artist). '
+             'Additional templates: %(album)s, %(artist)s. '
              'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
              '"Coldplay - Paradise"')
     postproc.add_option(
@@ -725,7 +734,7 @@ def parseOpts(overrideArguments=None):
         metavar='POLICY', dest='fixup', default='detect_or_warn',
         help='Automatically correct known faults of the file. '
              'One of never (do nothing), warn (only emit a warning), '
-             'detect_or_warn(the default; fix file if we can, warn otherwise)')
+             'detect_or_warn (the default; fix file if we can, warn otherwise)')
     postproc.add_option(
         '--prefer-avconv',
         action='store_false', dest='prefer_ffmpeg',
index 3b0e8ddd8bfed92f2e9043b0adea2655a9e5f67b..4191d040bb1da468e248d57b78df1afdacf64cd0 100644 (file)
@@ -23,6 +23,9 @@ class PostProcessor(object):
 
     PostProcessor objects follow a "mutual registration" process similar
     to InfoExtractor objects.
+
+    Optionally PostProcessor can use a list of additional command-line arguments
+    with self._configuration_args.
     """
 
     _downloader = None
@@ -57,6 +60,13 @@ class PostProcessor(object):
         except Exception:
             self._downloader.report_warning(errnote)
 
+    def _configuration_args(self, default=[]):
+        pp_args = self._downloader.params.get('postprocessor_args')
+        if pp_args is None:
+            return default
+        assert isinstance(pp_args, list)
+        return pp_args
+
 
 class AudioConversionError(PostProcessingError):
     pass
index 4868a42fdca9f486bed5e1def3ffaf08b26fda39..e19dbf73d5fe36c602d9ffb83cd2d02ab39cb5e1 100644 (file)
@@ -7,12 +7,9 @@ import subprocess
 
 from .ffmpeg import FFmpegPostProcessor
 
-from ..compat import (
-    compat_urlretrieve,
-)
 from ..utils import (
-    determine_ext,
     check_executable,
+    encodeArgument,
     encodeFilename,
     PostProcessingError,
     prepend_extension,
@@ -25,34 +22,48 @@ class EmbedThumbnailPPError(PostProcessingError):
 
 
 class EmbedThumbnailPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, already_have_thumbnail=False):
+        super(EmbedThumbnailPP, self).__init__(downloader)
+        self._already_have_thumbnail = already_have_thumbnail
+
     def run(self, info):
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
-        temp_thumbnail = filename + '.' + determine_ext(info['thumbnail'])
 
-        if not info.get('thumbnail'):
+        if not info.get('thumbnails'):
             raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.')
 
-        compat_urlretrieve(info['thumbnail'], temp_thumbnail)
+        thumbnail_filename = info['thumbnails'][-1]['filename']
+
+        if not os.path.exists(encodeFilename(thumbnail_filename)):
+            self._downloader.report_warning(
+                'Skipping embedding the thumbnail because the file is missing.')
+            return [], info
 
         if info['ext'] == 'mp3':
             options = [
-                '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
+                '-c', 'copy', '-map', '0', '-map', '1',
                 '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
 
             self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
 
-            self.run_ffmpeg(filename, temp_filename, options)
+            self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
 
-            os.remove(encodeFilename(temp_thumbnail))
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        elif info['ext'] == 'm4a':
+        elif info['ext'] in ['m4a', 'mp4']:
             if not check_executable('AtomicParsley', ['-v']):
                 raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
 
-            cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename]
+            cmd = [encodeFilename('AtomicParsley', True),
+                   encodeFilename(filename, True),
+                   encodeArgument('--artwork'),
+                   encodeFilename(thumbnail_filename, True),
+                   encodeArgument('-o'),
+                   encodeFilename(temp_filename, True)]
 
             self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
 
@@ -66,7 +77,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
                 msg = stderr.decode('utf-8', 'replace').strip()
                 raise EmbedThumbnailPPError(msg)
 
-            os.remove(encodeFilename(temp_thumbnail))
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
             # for formats that don't support thumbnails (like 3gp) AtomicParsley
             # won't create to the temporary file
             if b'No changes' in stdout:
@@ -75,6 +87,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
                 os.remove(encodeFilename(filename))
                 os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         else:
-            raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
 
         return [], info
index 3414375751642c6cea7e6e79cdf81c786f114284..13794b7ba8653b179a08a348744441ae5c296852 100644 (file)
@@ -8,8 +8,8 @@ from ..utils import PostProcessingError
 
 
 class ExecAfterDownloadPP(PostProcessor):
-    def __init__(self, downloader=None, verboseOutput=None, exec_cmd=None):
-        self.verboseOutput = verboseOutput
+    def __init__(self, downloader, exec_cmd):
+        super(ExecAfterDownloadPP, self).__init__(downloader)
         self.exec_cmd = exec_cmd
 
     def run(self, information):
index 214de39f9aa80e6f042b63e2069ded1ab6d123bd..1f723908be8d4ff0247affc5aed9ffc44e777602 100644 (file)
@@ -21,6 +21,7 @@ from ..utils import (
     shell_quote,
     subtitles_filename,
     dfxp2srt,
+    ISO639Utils,
 )
 
 
@@ -130,6 +131,8 @@ class FFmpegPostProcessor(PostProcessor):
         oldest_mtime = min(
             os.stat(encodeFilename(path)).st_mtime for path in input_paths)
 
+        opts += self._configuration_args()
+
         files_cmd = []
         for path in input_paths:
             files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
@@ -262,7 +265,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
         # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
         if (new_path == path or
                 (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
-            self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
+            self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path)
             return [], information
 
         try:
@@ -293,13 +296,16 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
 
     def run(self, information):
         path = information['filepath']
-        prefix, sep, ext = path.rpartition('.')
-        outpath = prefix + sep + self._preferedformat
         if information['ext'] == self._preferedformat:
             self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
             return [], information
+        options = []
+        if self._preferedformat == 'avi':
+            options.extend(['-c:v', 'libxvid', '-vtag', 'XVID'])
+        prefix, sep, ext = path.rpartition('.')
+        outpath = prefix + sep + self._preferedformat
         self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
-        self.run_ffmpeg(path, outpath, [])
+        self.run_ffmpeg(path, outpath, options)
         information['filepath'] = outpath
         information['format'] = self._preferedformat
         information['ext'] = self._preferedformat
@@ -307,199 +313,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
 
 
 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
-    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
-    _lang_map = {
-        'aa': 'aar',
-        'ab': 'abk',
-        'ae': 'ave',
-        'af': 'afr',
-        'ak': 'aka',
-        'am': 'amh',
-        'an': 'arg',
-        'ar': 'ara',
-        'as': 'asm',
-        'av': 'ava',
-        'ay': 'aym',
-        'az': 'aze',
-        'ba': 'bak',
-        'be': 'bel',
-        'bg': 'bul',
-        'bh': 'bih',
-        'bi': 'bis',
-        'bm': 'bam',
-        'bn': 'ben',
-        'bo': 'bod',
-        'br': 'bre',
-        'bs': 'bos',
-        'ca': 'cat',
-        'ce': 'che',
-        'ch': 'cha',
-        'co': 'cos',
-        'cr': 'cre',
-        'cs': 'ces',
-        'cu': 'chu',
-        'cv': 'chv',
-        'cy': 'cym',
-        'da': 'dan',
-        'de': 'deu',
-        'dv': 'div',
-        'dz': 'dzo',
-        'ee': 'ewe',
-        'el': 'ell',
-        'en': 'eng',
-        'eo': 'epo',
-        'es': 'spa',
-        'et': 'est',
-        'eu': 'eus',
-        'fa': 'fas',
-        'ff': 'ful',
-        'fi': 'fin',
-        'fj': 'fij',
-        'fo': 'fao',
-        'fr': 'fra',
-        'fy': 'fry',
-        'ga': 'gle',
-        'gd': 'gla',
-        'gl': 'glg',
-        'gn': 'grn',
-        'gu': 'guj',
-        'gv': 'glv',
-        'ha': 'hau',
-        'he': 'heb',
-        'hi': 'hin',
-        'ho': 'hmo',
-        'hr': 'hrv',
-        'ht': 'hat',
-        'hu': 'hun',
-        'hy': 'hye',
-        'hz': 'her',
-        'ia': 'ina',
-        'id': 'ind',
-        'ie': 'ile',
-        'ig': 'ibo',
-        'ii': 'iii',
-        'ik': 'ipk',
-        'io': 'ido',
-        'is': 'isl',
-        'it': 'ita',
-        'iu': 'iku',
-        'ja': 'jpn',
-        'jv': 'jav',
-        'ka': 'kat',
-        'kg': 'kon',
-        'ki': 'kik',
-        'kj': 'kua',
-        'kk': 'kaz',
-        'kl': 'kal',
-        'km': 'khm',
-        'kn': 'kan',
-        'ko': 'kor',
-        'kr': 'kau',
-        'ks': 'kas',
-        'ku': 'kur',
-        'kv': 'kom',
-        'kw': 'cor',
-        'ky': 'kir',
-        'la': 'lat',
-        'lb': 'ltz',
-        'lg': 'lug',
-        'li': 'lim',
-        'ln': 'lin',
-        'lo': 'lao',
-        'lt': 'lit',
-        'lu': 'lub',
-        'lv': 'lav',
-        'mg': 'mlg',
-        'mh': 'mah',
-        'mi': 'mri',
-        'mk': 'mkd',
-        'ml': 'mal',
-        'mn': 'mon',
-        'mr': 'mar',
-        'ms': 'msa',
-        'mt': 'mlt',
-        'my': 'mya',
-        'na': 'nau',
-        'nb': 'nob',
-        'nd': 'nde',
-        'ne': 'nep',
-        'ng': 'ndo',
-        'nl': 'nld',
-        'nn': 'nno',
-        'no': 'nor',
-        'nr': 'nbl',
-        'nv': 'nav',
-        'ny': 'nya',
-        'oc': 'oci',
-        'oj': 'oji',
-        'om': 'orm',
-        'or': 'ori',
-        'os': 'oss',
-        'pa': 'pan',
-        'pi': 'pli',
-        'pl': 'pol',
-        'ps': 'pus',
-        'pt': 'por',
-        'qu': 'que',
-        'rm': 'roh',
-        'rn': 'run',
-        'ro': 'ron',
-        'ru': 'rus',
-        'rw': 'kin',
-        'sa': 'san',
-        'sc': 'srd',
-        'sd': 'snd',
-        'se': 'sme',
-        'sg': 'sag',
-        'si': 'sin',
-        'sk': 'slk',
-        'sl': 'slv',
-        'sm': 'smo',
-        'sn': 'sna',
-        'so': 'som',
-        'sq': 'sqi',
-        'sr': 'srp',
-        'ss': 'ssw',
-        'st': 'sot',
-        'su': 'sun',
-        'sv': 'swe',
-        'sw': 'swa',
-        'ta': 'tam',
-        'te': 'tel',
-        'tg': 'tgk',
-        'th': 'tha',
-        'ti': 'tir',
-        'tk': 'tuk',
-        'tl': 'tgl',
-        'tn': 'tsn',
-        'to': 'ton',
-        'tr': 'tur',
-        'ts': 'tso',
-        'tt': 'tat',
-        'tw': 'twi',
-        'ty': 'tah',
-        'ug': 'uig',
-        'uk': 'ukr',
-        'ur': 'urd',
-        'uz': 'uzb',
-        've': 'ven',
-        'vi': 'vie',
-        'vo': 'vol',
-        'wa': 'wln',
-        'wo': 'wol',
-        'xh': 'xho',
-        'yi': 'yid',
-        'yo': 'yor',
-        'za': 'zha',
-        'zh': 'zho',
-        'zu': 'zul',
-    }
-
-    @classmethod
-    def _conver_lang_code(cls, code):
-        """Convert language code from ISO 639-1 to ISO 639-2/T"""
-        return cls._lang_map.get(code[:2])
-
     def run(self, information):
         if information['ext'] not in ['mp4', 'mkv']:
             self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
@@ -525,7 +338,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
             opts += ['-c:s', 'mov_text']
         for (i, lang) in enumerate(sub_langs):
             opts.extend(['-map', '%d:0' % (i + 1)])
-            lang_code = self._conver_lang_code(lang)
+            lang_code = ISO639Utils.short2long(lang)
             if lang_code is not None:
                 opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
 
@@ -591,6 +404,23 @@ class FFmpegMergerPP(FFmpegPostProcessor):
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         return info['__files_to_merge'], info
 
+    def can_merge(self):
+        # TODO: figure out merge-capable ffmpeg version
+        if self.basename != 'avconv':
+            return True
+
+        required_version = '10-0'
+        if is_outdated_version(
+                self._versions[self.basename], required_version):
+            warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, '
+                       'youtube-dl will download single file media. '
+                       'Update %s to version %s or newer to fix this.') % (
+                           self.basename, self.basename, required_version)
+            if self._downloader:
+                self._downloader.report_warning(warning)
+            return False
+        return True
+
 
 class FFmpegFixupStretchedPP(FFmpegPostProcessor):
     def run(self, info):
index 93d0abcf6d1b563109c5488d10b67a012f916103..7d88e130820e073af1b6fd527390cb1cb5dc8dec 100644 (file)
@@ -3,18 +3,34 @@ from __future__ import unicode_literals
 import os
 import subprocess
 import sys
+import errno
 
 from .common import PostProcessor
-from ..compat import (
-    subprocess_check_output
-)
 from ..utils import (
     check_executable,
     hyphenate_date,
     version_tuple,
+    PostProcessingError,
+    encodeArgument,
+    encodeFilename,
 )
 
 
+class XAttrMetadataError(PostProcessingError):
+    def __init__(self, code=None, msg='Unknown error'):
+        super(XAttrMetadataError, self).__init__(msg)
+        self.code = code
+
+        # Parsing code and msg
+        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
+                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
+            self.reason = 'NO_SPACE'
+        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
+            self.reason = 'VALUE_TOO_LONG'
+        else:
+            self.reason = 'NOT_SUPPORTED'
+
+
 class XAttrMetadataPP(PostProcessor):
 
     #
@@ -51,7 +67,10 @@ class XAttrMetadataPP(PostProcessor):
                 raise ImportError
 
             def write_xattr(path, key, value):
-                return xattr.setxattr(path, key, value)
+                try:
+                    xattr.set(path, key, value)
+                except EnvironmentError as e:
+                    raise XAttrMetadataError(e.errno, e.strerror)
 
         except ImportError:
             if os.name == 'nt':
@@ -62,8 +81,11 @@ class XAttrMetadataPP(PostProcessor):
                     assert os.path.exists(path)
 
                     ads_fn = path + ":" + key
-                    with open(ads_fn, "wb") as f:
-                        f.write(value)
+                    try:
+                        with open(ads_fn, "wb") as f:
+                            f.write(value)
+                    except EnvironmentError as e:
+                        raise XAttrMetadataError(e.errno, e.strerror)
             else:
                 user_has_setfattr = check_executable("setfattr", ['--version'])
                 user_has_xattr = check_executable("xattr", ['-h'])
@@ -71,12 +93,27 @@ class XAttrMetadataPP(PostProcessor):
                 if user_has_setfattr or user_has_xattr:
 
                     def write_xattr(path, key, value):
+                        value = value.decode('utf-8')
                         if user_has_setfattr:
-                            cmd = ['setfattr', '-n', key, '-v', value, path]
+                            executable = 'setfattr'
+                            opts = ['-n', key, '-v', value]
                         elif user_has_xattr:
-                            cmd = ['xattr', '-w', key, value, path]
-
-                        subprocess_check_output(cmd)
+                            executable = 'xattr'
+                            opts = ['-w', key, value]
+
+                        cmd = ([encodeFilename(executable, True)] +
+                               [encodeArgument(o) for o in opts] +
+                               [encodeFilename(path, True)])
+
+                        try:
+                            p = subprocess.Popen(
+                                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+                        except EnvironmentError as e:
+                            raise XAttrMetadataError(e.errno, e.strerror)
+                        stdout, stderr = p.communicate()
+                        stderr = stderr.decode('utf-8', 'replace')
+                        if p.returncode != 0:
+                            raise XAttrMetadataError(p.returncode, stderr)
 
                 else:
                     # On Unix, and can't find pyxattr, setfattr, or xattr.
@@ -121,6 +158,19 @@ class XAttrMetadataPP(PostProcessor):
 
             return [], info
 
-        except (subprocess.CalledProcessError, OSError):
-            self._downloader.report_error("This filesystem doesn't support extended attributes. (You may have to enable them in your /etc/fstab)")
+        except XAttrMetadataError as e:
+            if e.reason == 'NO_SPACE':
+                self._downloader.report_warning(
+                    'There\'s no disk space left or disk quota exceeded. ' +
+                    'Extended attributes are not written.')
+            elif e.reason == 'VALUE_TOO_LONG':
+                self._downloader.report_warning(
+                    'Unable to write extended attributes due to too long values.')
+            else:
+                msg = 'This filesystem doesn\'t support extended attributes. '
+                if os.name == 'nt':
+                    msg += 'You need to use NTFS.'
+                else:
+                    msg += '(You may have to enable them in your /etc/fstab)'
+                self._downloader.report_error(msg)
             return [], info
index de3169eef1d6ec29d82a60b2f4b6a68f49d7dd4e..fc7ac8305d71c8cce077ef3040cd0903ac9f09c5 100644 (file)
@@ -50,7 +50,7 @@ def rsa_verify(message, signature, key):
 def update_self(to_screen, verbose):
     """Update the program file with the latest version from the repository"""
 
-    UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
+    UPDATE_URL = "https://rg3.github.io/youtube-dl/update/"
     VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
     JSON_URL = UPDATE_URL + 'versions.json'
     UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
index 1013f7c1879e0afac035327e1e60c13ee295ee21..ae813099dded05c15ed57d27e51be7704e778520 100644 (file)
@@ -62,6 +62,8 @@ std_headers = {
 }
 
 
+NO_DEFAULT = object()
+
 ENGLISH_MONTH_NAMES = [
     'January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December']
@@ -171,13 +173,15 @@ def xpath_with_ns(path, ns_map):
     return '/'.join(replaced)
 
 
-def xpath_text(node, xpath, name=None, fatal=False):
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     if sys.version_info < (2, 7):  # Crazy 2.6
         xpath = xpath.encode('ascii')
 
     n = node.find(xpath)
     if n is None or n.text is None:
-        if fatal:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
             name = xpath if name is None else name
             raise ExtractorError('Could not find XML element %s' % name)
         else:
@@ -327,13 +331,6 @@ def sanitize_path(s):
     return os.path.join(*sanitized_path)
 
 
-def sanitize_url_path_consecutive_slashes(url):
-    """Collapses consecutive slashes in URLs' path"""
-    parsed_url = list(compat_urlparse.urlparse(url))
-    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
-    return compat_urlparse.urlunparse(parsed_url)
-
-
 def orderedSet(iterable):
     """ Remove all duplicates from the input iterable """
     res = []
@@ -1312,10 +1309,10 @@ def parse_duration(s):
     m = re.match(
         r'''(?ix)(?:P?T)?
         (?:
-            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
+            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
 
-            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
+            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
             (?:
                 (?:
                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
@@ -1380,7 +1377,7 @@ def get_exe_version(exe, args=['--version'],
     or False if the executable is not present """
     try:
         out, _ = subprocess.Popen(
-            [exe] + args,
+            [encodeArgument(exe)] + args,
             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
     except OSError:
         return False
@@ -1672,6 +1669,7 @@ def mimetype2ext(mt):
     return {
         'x-ms-wmv': 'wmv',
         'x-mp4-fragmented': 'mp4',
+        'ttml+xml': 'ttml',
     }.get(res, res)
 
 
@@ -1842,16 +1840,15 @@ def parse_dfxp_time_expr(time_expr):
         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
 
 
-def format_srt_time(seconds):
-    (mins, secs) = divmod(seconds, 60)
-    (hours, mins) = divmod(mins, 60)
-    millisecs = (secs - int(secs)) * 1000
-    secs = int(secs)
-    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+def srt_subtitles_timecode(seconds):
+    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
 
 
 def dfxp2srt(dfxp_data):
-    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+    })
 
     def parse_node(node):
         str_or_empty = functools.partial(str_or_none, default='')
@@ -1859,9 +1856,9 @@ def dfxp2srt(dfxp_data):
         out = str_or_empty(node.text)
 
         for child in node:
-            if child.tag == _x('ttml:br'):
+            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                 out += '\n' + str_or_empty(child.tail)
-            elif child.tag == _x('ttml:span'):
+            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                 out += str_or_empty(parse_node(child))
             else:
                 out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1870,18 +1867,487 @@ def dfxp2srt(dfxp_data):
 
     dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
     out = []
-    paras = dfxp.findall(_x('.//ttml:p'))
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+
+    if not paras:
+        raise ValueError('Invalid dfxp/TTML subtitle')
 
     for para, index in zip(paras, itertools.count(1)):
+        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        if not end_time:
+            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
         out.append('%d\n%s --> %s\n%s\n\n' % (
             index,
-            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
-            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+            srt_subtitles_timecode(begin_time),
+            srt_subtitles_timecode(end_time),
             parse_node(para)))
 
     return ''.join(out)
 
 
+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
+
+class ISO3166Utils(object):
+    # From http://data.okfn.org/data/core/country-list
+    _country_map = {
+        'AF': 'Afghanistan',
+        'AX': 'Åland Islands',
+        'AL': 'Albania',
+        'DZ': 'Algeria',
+        'AS': 'American Samoa',
+        'AD': 'Andorra',
+        'AO': 'Angola',
+        'AI': 'Anguilla',
+        'AQ': 'Antarctica',
+        'AG': 'Antigua and Barbuda',
+        'AR': 'Argentina',
+        'AM': 'Armenia',
+        'AW': 'Aruba',
+        'AU': 'Australia',
+        'AT': 'Austria',
+        'AZ': 'Azerbaijan',
+        'BS': 'Bahamas',
+        'BH': 'Bahrain',
+        'BD': 'Bangladesh',
+        'BB': 'Barbados',
+        'BY': 'Belarus',
+        'BE': 'Belgium',
+        'BZ': 'Belize',
+        'BJ': 'Benin',
+        'BM': 'Bermuda',
+        'BT': 'Bhutan',
+        'BO': 'Bolivia, Plurinational State of',
+        'BQ': 'Bonaire, Sint Eustatius and Saba',
+        'BA': 'Bosnia and Herzegovina',
+        'BW': 'Botswana',
+        'BV': 'Bouvet Island',
+        'BR': 'Brazil',
+        'IO': 'British Indian Ocean Territory',
+        'BN': 'Brunei Darussalam',
+        'BG': 'Bulgaria',
+        'BF': 'Burkina Faso',
+        'BI': 'Burundi',
+        'KH': 'Cambodia',
+        'CM': 'Cameroon',
+        'CA': 'Canada',
+        'CV': 'Cape Verde',
+        'KY': 'Cayman Islands',
+        'CF': 'Central African Republic',
+        'TD': 'Chad',
+        'CL': 'Chile',
+        'CN': 'China',
+        'CX': 'Christmas Island',
+        'CC': 'Cocos (Keeling) Islands',
+        'CO': 'Colombia',
+        'KM': 'Comoros',
+        'CG': 'Congo',
+        'CD': 'Congo, the Democratic Republic of the',
+        'CK': 'Cook Islands',
+        'CR': 'Costa Rica',
+        'CI': 'Côte d\'Ivoire',
+        'HR': 'Croatia',
+        'CU': 'Cuba',
+        'CW': 'Curaçao',
+        'CY': 'Cyprus',
+        'CZ': 'Czech Republic',
+        'DK': 'Denmark',
+        'DJ': 'Djibouti',
+        'DM': 'Dominica',
+        'DO': 'Dominican Republic',
+        'EC': 'Ecuador',
+        'EG': 'Egypt',
+        'SV': 'El Salvador',
+        'GQ': 'Equatorial Guinea',
+        'ER': 'Eritrea',
+        'EE': 'Estonia',
+        'ET': 'Ethiopia',
+        'FK': 'Falkland Islands (Malvinas)',
+        'FO': 'Faroe Islands',
+        'FJ': 'Fiji',
+        'FI': 'Finland',
+        'FR': 'France',
+        'GF': 'French Guiana',
+        'PF': 'French Polynesia',
+        'TF': 'French Southern Territories',
+        'GA': 'Gabon',
+        'GM': 'Gambia',
+        'GE': 'Georgia',
+        'DE': 'Germany',
+        'GH': 'Ghana',
+        'GI': 'Gibraltar',
+        'GR': 'Greece',
+        'GL': 'Greenland',
+        'GD': 'Grenada',
+        'GP': 'Guadeloupe',
+        'GU': 'Guam',
+        'GT': 'Guatemala',
+        'GG': 'Guernsey',
+        'GN': 'Guinea',
+        'GW': 'Guinea-Bissau',
+        'GY': 'Guyana',
+        'HT': 'Haiti',
+        'HM': 'Heard Island and McDonald Islands',
+        'VA': 'Holy See (Vatican City State)',
+        'HN': 'Honduras',
+        'HK': 'Hong Kong',
+        'HU': 'Hungary',
+        'IS': 'Iceland',
+        'IN': 'India',
+        'ID': 'Indonesia',
+        'IR': 'Iran, Islamic Republic of',
+        'IQ': 'Iraq',
+        'IE': 'Ireland',
+        'IM': 'Isle of Man',
+        'IL': 'Israel',
+        'IT': 'Italy',
+        'JM': 'Jamaica',
+        'JP': 'Japan',
+        'JE': 'Jersey',
+        'JO': 'Jordan',
+        'KZ': 'Kazakhstan',
+        'KE': 'Kenya',
+        'KI': 'Kiribati',
+        'KP': 'Korea, Democratic People\'s Republic of',
+        'KR': 'Korea, Republic of',
+        'KW': 'Kuwait',
+        'KG': 'Kyrgyzstan',
+        'LA': 'Lao People\'s Democratic Republic',
+        'LV': 'Latvia',
+        'LB': 'Lebanon',
+        'LS': 'Lesotho',
+        'LR': 'Liberia',
+        'LY': 'Libya',
+        'LI': 'Liechtenstein',
+        'LT': 'Lithuania',
+        'LU': 'Luxembourg',
+        'MO': 'Macao',
+        'MK': 'Macedonia, the Former Yugoslav Republic of',
+        'MG': 'Madagascar',
+        'MW': 'Malawi',
+        'MY': 'Malaysia',
+        'MV': 'Maldives',
+        'ML': 'Mali',
+        'MT': 'Malta',
+        'MH': 'Marshall Islands',
+        'MQ': 'Martinique',
+        'MR': 'Mauritania',
+        'MU': 'Mauritius',
+        'YT': 'Mayotte',
+        'MX': 'Mexico',
+        'FM': 'Micronesia, Federated States of',
+        'MD': 'Moldova, Republic of',
+        'MC': 'Monaco',
+        'MN': 'Mongolia',
+        'ME': 'Montenegro',
+        'MS': 'Montserrat',
+        'MA': 'Morocco',
+        'MZ': 'Mozambique',
+        'MM': 'Myanmar',
+        'NA': 'Namibia',
+        'NR': 'Nauru',
+        'NP': 'Nepal',
+        'NL': 'Netherlands',
+        'NC': 'New Caledonia',
+        'NZ': 'New Zealand',
+        'NI': 'Nicaragua',
+        'NE': 'Niger',
+        'NG': 'Nigeria',
+        'NU': 'Niue',
+        'NF': 'Norfolk Island',
+        'MP': 'Northern Mariana Islands',
+        'NO': 'Norway',
+        'OM': 'Oman',
+        'PK': 'Pakistan',
+        'PW': 'Palau',
+        'PS': 'Palestine, State of',
+        'PA': 'Panama',
+        'PG': 'Papua New Guinea',
+        'PY': 'Paraguay',
+        'PE': 'Peru',
+        'PH': 'Philippines',
+        'PN': 'Pitcairn',
+        'PL': 'Poland',
+        'PT': 'Portugal',
+        'PR': 'Puerto Rico',
+        'QA': 'Qatar',
+        'RE': 'Réunion',
+        'RO': 'Romania',
+        'RU': 'Russian Federation',
+        'RW': 'Rwanda',
+        'BL': 'Saint Barthélemy',
+        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
+        'KN': 'Saint Kitts and Nevis',
+        'LC': 'Saint Lucia',
+        'MF': 'Saint Martin (French part)',
+        'PM': 'Saint Pierre and Miquelon',
+        'VC': 'Saint Vincent and the Grenadines',
+        'WS': 'Samoa',
+        'SM': 'San Marino',
+        'ST': 'Sao Tome and Principe',
+        'SA': 'Saudi Arabia',
+        'SN': 'Senegal',
+        'RS': 'Serbia',
+        'SC': 'Seychelles',
+        'SL': 'Sierra Leone',
+        'SG': 'Singapore',
+        'SX': 'Sint Maarten (Dutch part)',
+        'SK': 'Slovakia',
+        'SI': 'Slovenia',
+        'SB': 'Solomon Islands',
+        'SO': 'Somalia',
+        'ZA': 'South Africa',
+        'GS': 'South Georgia and the South Sandwich Islands',
+        'SS': 'South Sudan',
+        'ES': 'Spain',
+        'LK': 'Sri Lanka',
+        'SD': 'Sudan',
+        'SR': 'Suriname',
+        'SJ': 'Svalbard and Jan Mayen',
+        'SZ': 'Swaziland',
+        'SE': 'Sweden',
+        'CH': 'Switzerland',
+        'SY': 'Syrian Arab Republic',
+        'TW': 'Taiwan, Province of China',
+        'TJ': 'Tajikistan',
+        'TZ': 'Tanzania, United Republic of',
+        'TH': 'Thailand',
+        'TL': 'Timor-Leste',
+        'TG': 'Togo',
+        'TK': 'Tokelau',
+        'TO': 'Tonga',
+        'TT': 'Trinidad and Tobago',
+        'TN': 'Tunisia',
+        'TR': 'Turkey',
+        'TM': 'Turkmenistan',
+        'TC': 'Turks and Caicos Islands',
+        'TV': 'Tuvalu',
+        'UG': 'Uganda',
+        'UA': 'Ukraine',
+        'AE': 'United Arab Emirates',
+        'GB': 'United Kingdom',
+        'US': 'United States',
+        'UM': 'United States Minor Outlying Islands',
+        'UY': 'Uruguay',
+        'UZ': 'Uzbekistan',
+        'VU': 'Vanuatu',
+        'VE': 'Venezuela, Bolivarian Republic of',
+        'VN': 'Viet Nam',
+        'VG': 'Virgin Islands, British',
+        'VI': 'Virgin Islands, U.S.',
+        'WF': 'Wallis and Futuna',
+        'EH': 'Western Sahara',
+        'YE': 'Yemen',
+        'ZM': 'Zambia',
+        'ZW': 'Zimbabwe',
+    }
+
+    @classmethod
+    def short2full(cls, code):
+        """Convert an ISO 3166-2 country code to the corresponding full name"""
+        return cls._country_map.get(code.upper())
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers
index a5a81bcd23009c3f63b2e7d1b89143492be2bf03..280afdd7f8f21cc058768eba232a11672ca32761 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.05.04'
+__version__ = '2015.07.21'