From: Ismael Mejia Date: Thu, 22 Aug 2013 21:29:36 +0000 (+0200) Subject: Merge branch 'master' into subtitles_rework X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=18b4e04f1c663e0ea695f6501b860f85af9d7ca1;hp=d80a064eff4fe2416f9db36b07f1e2ca641f1334 Merge branch 'master' into subtitles_rework --- diff --git a/.gitignore b/.gitignore index fca34b8ba..61cb6bc3c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ build/ dist/ MANIFEST README.txt -README.md youtube-dl.1 youtube-dl.bash-completion youtube-dl diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 2b3879f0a..dca963e8f 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -11,30 +11,42 @@ tests = [ # 90 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), + # 89 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", + "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), - # 86 - vfl_ymO4Z 2013/06/27 + "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), + # 86 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), - # 85 - vflSAFCP9 2013/07/19 + "yuioplkjhgfdsazecvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"), + # 85 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", - "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), + ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 - vflcaqGO8 2013/07/11 + "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ09876543q1mnbvcxzasdfghjklpoiuew2"), + # 83 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"), + ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), - # 81 + # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", - "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."), + "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 79 - vflLC8JvQ 2013/07/25 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", + "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), +] + +tests_age_gate = [ + # 86 - vflqinMWD + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", + "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), ] def find_matching(wrong, right): @@ -87,6 +99,8 @@ def genall(tests): def main(): print(genall(tests)) + print(u' Age gate:') + print(genall(tests_age_gate)) if __name__ == '__main__': main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index c73d0e467..c54faa380 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -50,6 +50,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc') + self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc') def test_no_duplicates(self): ies = gen_extractors() diff --git a/test/test_playlists.py b/test/test_playlists.py new file mode 100644 index 000000000..65de3a55c --- /dev/null +++ b/test/test_playlists.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import sys +import unittest +import json + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.utils import * + +from helper import FakeYDL + +class TestPlaylists(unittest.TestCase): + def assertIsPlaylist(self, info): + """Make sure the info has '_type' set to 'playlist'""" + self.assertEqual(info['_type'], 'playlist') + + def test_dailymotion_playlist(self): + dl = FakeYDL() + ie = DailymotionPlaylistIE(dl) + result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'SPORT') + self.assertTrue(len(result['entries']) > 20) + + def test_vimeo_channel(self): + dl = FakeYDL() + ie = VimeoChannelIE(dl) + result = ie.extract('http://vimeo.com/channels/tributes') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Vimeo Tributes') + self.assertTrue(len(result['entries']) > 24) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py deleted file mode 100644 index 51b300532..000000000 --- a/test/test_youtube_sig.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -import unittest -import sys - -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from youtube_dl.extractor.youtube import YoutubeIE -from helper import FakeYDL - -sig = YoutubeIE(FakeYDL())._decrypt_signature - -class TestYoutubeSig(unittest.TestCase): - def test_92(self): - wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8" - right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7" - self.assertEqual(sig(wrong), right) - - def test_90(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`" - right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|" - self.assertEqual(sig(wrong), right) - - def test_88(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<" - right = "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej" - self.assertEqual(sig(wrong), right) - - def test_87(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" - right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" - self.assertEqual(sig(wrong), right) - - def test_86(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<" - right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@" - self.assertEqual(sig(wrong), right) - - def test_85(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<" - right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c" - self.assertEqual(sig(wrong), right) - - def test_84(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<" - right = "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1" - self.assertEqual(sig(wrong), right) - - def test_83(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<" - right = "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<" - self.assertEqual(sig(wrong), right) - - def test_82(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<" - right = "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9" - self.assertEqual(sig(wrong), right) - - def test_81(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>." - right = "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>." - self.assertEqual(sig(wrong), right) - -if __name__ == '__main__': - unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 155895fe2..217c4a52f 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -79,9 +79,13 @@ class FileDownloader(object): rate = float(current) / dif eta = int((float(total) - float(current)) / rate) (eta_mins, eta_secs) = divmod(eta, 60) - if eta_mins > 99: - return '--:--' - return '%02d:%02d' % (eta_mins, eta_secs) + (eta_hours, eta_mins) = divmod(eta_mins, 60) + if eta_hours > 99: + return '--:--:--' + if eta_hours == 0: + return '%02d:%02d' % (eta_mins, eta_secs) + else: + return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) @staticmethod def calc_speed(start, now, bytes): @@ -329,6 +333,35 @@ class FileDownloader(object): self.report_error(u'mplayer exited with code %d' % retval) return False + def _download_m3u8_with_ffmpeg(self, filename, url): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename] + # Check for ffmpeg first + try: + subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) + except (OSError, IOError): + self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] ) + return False + + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + return True + else: + self.to_stderr(u"\n") + self.report_error(u'ffmpeg exited with code %d' % retval) + return False + def _do_download(self, filename, info_dict): url = info_dict['url'] @@ -354,6 +387,10 @@ class FileDownloader(object): if url.startswith('mms') or url.startswith('rtsp'): return self._download_with_mplayer(filename, url) + # m3u8 manifest are downloaded with ffmpeg + if determine_ext(url) == u'm3u8': + return self._download_m3u8_with_ffmpeg(filename, url) + tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 8c5e53991..fddf58606 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -100,7 +100,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): self._nopostoverwrites = nopostoverwrites def get_audio_codec(self, path): - if not self._exes['ffprobe'] and not self._exes['avprobe']: return None + if not self._exes['ffprobe'] and not self._exes['avprobe']: + raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.') try: cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))] handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE) @@ -208,7 +209,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): try: os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) except: - self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') + self._downloader.report_warning(u'Cannot update utime of audio file') information['filepath'] = new_path return self._nopostoverwrites,information diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e11d6f994..fa7bb1387 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -261,7 +261,7 @@ class YoutubeDL(object): self.report_error(u'Erroneous output template') return None except ValueError as err: - self.report_error(u'Insufficient system charset ' + repr(preferredencoding())) + self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict): @@ -535,7 +535,7 @@ class YoutubeDL(object): try: success = self.fd._do_download(filename, info_dict) except (OSError, IOError) as err: - raise UnavailableVideoError() + raise UnavailableVideoError(err) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_error(u'unable to download video data: %s' % str(err)) return @@ -582,7 +582,7 @@ class YoutubeDL(object): # No clear decision yet, let IE decide keep_video = keep_video_wish except PostProcessingError as e: - self.to_stderr(u'ERROR: ' + e.msg) + self.report_error(e.msg) if keep_video is False and not self.params.get('keepvideo', False): try: self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 34f3dad0f..c21bf6d4a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -27,6 +27,7 @@ __authors__ = ( 'Johny Mo Swag', 'Axel Noack', 'Albert Kim', + 'Pierre Rudloff', ) __license__ = 'Public Domain' @@ -343,7 +344,7 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) - commandLineConf = sys.argv[1:] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) if opts.verbose: @@ -377,7 +378,7 @@ def _real_main(argv=None): # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent - + # Set referer if opts.referer is not None: std_headers['Referer'] = opts.referer @@ -398,6 +399,8 @@ def _real_main(argv=None): batchurls = batchfd.readlines() batchurls = [x.strip() for x in batchurls] batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] + if opts.verbose: + sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b4a1c20e9..b4db8f0bf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -12,7 +12,7 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE +from .dailymotion import DailymotionIE, DailymotionPlaylistIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE @@ -36,23 +36,31 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE +from .kankan import KankanIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE from .mixcloud import MixcloudIE from .mtv import MTVIE +from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .ooyala import OoyalaIE +from .pbs import PBSIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .roxwel import RoxwelIE +from .rtlnow import RTLnowIE from .sina import SinaIE +from .slashdot import SlashdotIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -67,10 +75,12 @@ from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE from .ustream import UstreamIE +from .unistra import UnistraIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE -from .vimeo import VimeoIE +from .videofyme import VideofyMeIE +from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .c56 import C56IE from .wat import WatIE @@ -92,6 +102,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeShowIE, YoutubeSubscriptionsIE, + YoutubeRecommendedIE, + YoutubeWatchLaterIE, + YoutubeFavouritesIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 993e30f7a..69b3b0ad7 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,13 +17,14 @@ class ArteTvIE(InfoExtractor): """ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' + _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) + return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -68,6 +69,12 @@ class ArteTvIE(InfoExtractor): lang = mobj.group('lang') return self._extract_video(url, id, lang) + mobj = re.match(self._LIVEWEB_URL, url) + if mobj is not None: + name = mobj.group('name') + lang = mobj.group('lang') + return self._extract_liveweb(url, name, lang) + if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) @@ -85,7 +92,7 @@ class ArteTvIE(InfoExtractor): info_dict = {'id': player_info['VID'], 'title': player_info['VTI'], - 'description': player_info['VDE'], + 'description': player_info.get('VDE'), 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), 'thumbnail': player_info['programImage'], 'ext': 'flv', @@ -98,12 +105,14 @@ class ArteTvIE(InfoExtractor): l = 'F' elif lang == 'de': l = 'A' - regexes = [r'VO?%s' % l, r'V%s-ST.' % l] + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url formats = filter(_match_lang, formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) # Pick the best quality format_info = formats[-1] if format_info['mediaType'] == u'rtmp': @@ -144,3 +153,22 @@ class ArteTvIE(InfoExtractor): 'url': video_url, 'ext': 'flv', } + + def _extract_liveweb(self, url, name, lang): + """Extract form http://liveweb.arte.tv/""" + webpage = self._download_webpage(url, name) + video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') + config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + video_id, u'Downloading information') + config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + event_doc = config_doc.find('event') + url_node = event_doc.find('video').find('urlHd') + if url_node is None: + url_node = video_doc.find('urlSd') + + return {'id': video_id, + 'title': event_doc.find('name%s' % lang.capitalize()).text, + 'url': url_node.text.replace('MP4', 'mp4'), + 'ext': 'flv', + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e89..53a898de3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@ import re +import json from .common import InfoExtractor +from ..utils import determine_ext class BreakIE(InfoExtractor): @@ -17,17 +19,20 @@ class BreakIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1).split("-")[-1] - webpage = self._download_webpage(url, video_id) - video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) - key = re.search(r"icon: '(.+?)',",webpage).group(1) - final_url = str(video_url)+"?"+str(key) - thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) - title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) - ext = video_url.split('.')[-1] + embed_url = 'http://www.break.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) + info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, + u'info json', flags=re.DOTALL) + info = json.loads(info_json) + video_url = info['videoUri'] + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) + if m_youtube is not None: + return self.url_result(m_youtube.group(1), 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return [{ 'id': video_id, 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, + 'ext': determine_ext(final_url), + 'title': info['contentName'], + 'thumbnail': info['thumbUri'], }] diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972e5..8d4c93d6d 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,36 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, compat_urllib_parse_urlparse, - compat_urllib_request, + determine_ext, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/?(?P.*)$' - def report_manifest(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Downloading XML manifest' % video_id) + _TESTS = [{ + u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + u'file': u'6902724.mp4', + u'md5': u'1264c12ad95dca142a9f0bf7968105a0', + u'info_dict': { + u'title': u'Comic-Con Cosplay Catastrophe', + u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', + }, + }, + { + u'url': u'http://www.collegehumor.com/video/3505939/font-conference', + u'file': u'3505939.mp4', + u'md5': u'c51ca16b82bb456a4397987791a835f5', + u'info_dict': { + u'title': u'Font Conference', + u'description': u'This video wasn\'t long enough, so we made it double-spaced.', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -36,39 +46,42 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + metaXml = self._download_webpage(xmlUrl, video_id, + u'Downloading info XML', + u'Unable to download video info XML') mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] + youtubeIdNode = videoNode.find('./youtubeID') + if youtubeIdNode is not None: + return self.url_result(youtubeIdNode.text, 'Youtube') info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text - manifest_url = videoNode.findall('./file')[0].text + next_url = videoNode.findall('./file')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') - manifest_url += '?hdcore=2.10.3' - self.report_manifest(video_id) - try: - manifestXml = compat_urllib_request.urlopen(manifest_url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) - - adoc = xml.etree.ElementTree.fromstring(manifestXml) - try: - media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] - node_id = media_node.attrib['url'] - video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError as err: - raise ExtractorError(u'Invalid manifest file') + if next_url.endswith(u'manifest.f4m'): + manifest_url = next_url + '?hdcore=2.10.3' + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') - url_pr = compat_urllib_parse_urlparse(manifest_url) - url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' + adoc = xml.etree.ElementTree.fromstring(manifestXml) + try: + media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] + node_id = media_node.attrib['url'] + video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text + except IndexError as err: + raise ExtractorError(u'Invalid manifest file') + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' + else: + # Old-style direct links + info['url'] = next_url + info['ext'] = determine_ext(info['url']) - info['url'] = url - info['ext'] = 'f4f' - return [info] + return info diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d5e..bf8d711ee 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor): (full-episodes/(?P.*)| (?P (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*))))) + |(watch/(?P[^/]*)/(?P.*)))| + (?P + extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) $""" _TEST = { u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('cntitle') dlNewest = False + elif mobj.group('interview'): + epTitle = mobj.group('interview_title') + dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e2e192bef..52c4483c9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -78,7 +78,13 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None @classmethod def working(cls): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8fab16005..f54ecc569 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -82,8 +82,8 @@ class DailymotionIE(DailyMotionSubtitlesIE): # TODO: support choosing qualities - for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url', - 'stream_h264_hq_url', 'stream_h264_url', + for key in ['stream_h264_hd1080_url','stream_h264_hd_url', + 'stream_h264_hq_url','stream_h264_url', 'stream_h264_ld_url']: if info.get(key): # key in info and info[key]: max_quality = key @@ -116,3 +116,31 @@ class DailymotionIE(DailyMotionSubtitlesIE): 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'] }] + + +class DailymotionPlaylistIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _MORE_PAGES_INDICATOR = r'' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + video_ids = [] + + for pagenum in itertools.count(1): + webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), + playlist_id, u'Downloading page %s' % pagenum) + + playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) + video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + + if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: + break + + entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in video_ids] + return {'_type': 'playlist', + 'id': playlist_id, + 'title': get_element_by_id(u'playlist_name', webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index fe1582d1a..3443f19c5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,17 +8,30 @@ class ExfmIE(InfoExtractor): IE_NAME = u'exfm' IE_DESC = u'ex.fm' _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' - _SOUNDCLOUD_URL_ = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' - _TEST = { - u'url': u'http://ex.fm/song/1bgtzg', - u'file': u'1bgtzg.mp3', - u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', - u'info_dict': { - u"title": u"We Can't Stop", - u"uploader": u"Miley Cyrus", - u'thumbnail': u'http://i1.sndcdn.com/artworks-000049666230-w9i7ef-t500x500.jpg?9d68d37' - } - } + _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _TESTS = [ + { + u'url': u'http://ex.fm/song/1bgtzg', + u'file': u'95223130.mp3', + u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', + u'info_dict': { + u"title": u"We Can't Stop - Miley Cyrus", + u"uploader": u"Miley Cyrus", + u'upload_date': u'20130603', + u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', + }, + u'note': u'Soundcloud song', + }, + { + u'url': u'http://ex.fm/song/wddt8', + u'file': u'wddt8.mp3', + u'md5': u'966bd70741ac5b8570d8e45bfaed3643', + u'info_dict': { + u'title': u'Safe and Sound', + u'uploader': u'Capital Cities', + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -26,11 +39,10 @@ class ExfmIE(InfoExtractor): info_url = "http://ex.fm/api/v3/song/%s" %(song_id) webpage = self._download_webpage(info_url, song_id) info = json.loads(webpage) - song_url = re.match(self._SOUNDCLOUD_URL_,info['song']['url']) - if song_url is not None: - song_url = song_url.group() + "?client_id=b45b1aa10f1ac2941910a7f0d10f8e28" - else: - song_url = info['song']['url'] + song_url = info['song']['url'] + if re.match(self._SOUNDCLOUD_URL, song_url) is not None: + self.to_screen('Soundcloud song detected') + return self.url_result(song_url.replace('/stream',''), 'Soundcloud') return [{ 'id': song_id, 'url': song_url, diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 67a7e5f76..4508f0dfa 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,17 +21,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r']*>\s*]*>\s*(?P.*?)</h1>", - r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), } return [info] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b633e896c..da016f7ee 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,8 +107,13 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] + try: + new_url = self._test_redirect(url) + if new_url: + return [self.url_result(new_url)] + except compat_urllib_error.HTTPError: + # This may be a stupid server that doesn't like HEAD, our UA, or so + pass video_id = url.split('/')[-1] try: @@ -144,6 +149,9 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: mobj = re.search(r'.*?I[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI?[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py new file mode 100644 index 000000000..4327bc13d --- /dev/null +++ b/youtube_dl/extractor/jeuxvideo.py @@ -0,0 +1,47 @@ +# coding: utf-8 + +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + +class JeuxVideoIE(InfoExtractor): + _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + + _TEST = { + u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', + u'file': u'5182.mp4', + u'md5': u'e0fdb0cd3ce98713ef9c1e1e025779d0', + u'info_dict': { + u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité', + u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + m_download = re.search(r'', webpage) + + xml_link = m_download.group(1) + + id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + + xml_config = self._download_webpage(xml_link, title, + 'Downloading XML config') + config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) + info = re.search(r'(.*?)', + xml_config, re.MULTILINE|re.DOTALL).group(1) + info = json.loads(info)['versions'][0] + + video_url = 'http://video720.jeuxvideo.com/' + info['file'] + + return {'id': id, + 'title' : config.find('titre_video').text, + 'ext' : 'mp4', + 'url' : video_url, + 'description': self._og_search_description(webpage), + 'thumbnail': config.find('image').text, + } diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py new file mode 100644 index 000000000..8537ba584 --- /dev/null +++ b/youtube_dl/extractor/kankan.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class KankanIE(InfoExtractor): + _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P\d+)\.shtml' + + _TEST = { + u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml', + u'file': u'48863.flv', + u'md5': u'29aca1e47ae68fc28804aca89f29507e', + u'info_dict': { + u'title': u'Ready To Go', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') + gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + + video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, + video_id, u'Downloading video url info') + ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip') + path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') + video_url = 'http://%s%s' % (ip, path) + + return {'id': video_id, + 'title': title, + 'url': video_url, + 'ext': determine_ext(video_url), + } diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index dda78743d..a7b88d2d9 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py new file mode 100644 index 000000000..03e31ea1c --- /dev/null +++ b/youtube_dl/extractor/muzu.py @@ -0,0 +1,64 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + determine_ext, +) + + +class MuzuTVIE(InfoExtractor): + _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P\d+)' + IE_NAME = u'muzu.tv' + + _TEST = { + u'url': u'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', + u'file': u'1981454.mp4', + u'md5': u'98f8b2c7bc50578d6a0364fff2bfb000', + u'info_dict': { + u'title': u'Cat Walk (Original Mix)', + u'description': u'md5:90e868994de201b2570e4e5854e19420', + u'uploader': u'MarcAshken featuring SOS', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + info_data = compat_urllib_parse.urlencode({'format': 'json', + 'url': url, + }) + video_info_page = self._download_webpage('http://www.muzu.tv/api/oembed/?%s' % info_data, + video_id, u'Downloading video info') + info = json.loads(video_info_page) + + player_info_page = self._download_webpage('http://player.muzu.tv/player/playerInit?ai=%s' % video_id, + video_id, u'Downloading player info') + video_info = json.loads(player_info_page)['videos'][0] + for quality in ['1080' , '720', '480', '360']: + if video_info.get('v%s' % quality): + break + + data = compat_urllib_parse.urlencode({'ai': video_id, + # Even if each time you watch a video the hash changes, + # it seems to work for different videos, and it will work + # even if you use any non empty string as a hash + 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', + 'device': 'web', + 'qv': quality, + }) + video_url_page = self._download_webpage('http://player.muzu.tv/player/requestVideo?%s' % data, + video_id, u'Downloading video url') + video_url_info = json.loads(video_url_page) + video_url = video_url_info['url'] + + return {'id': video_id, + 'title': info['title'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info['thumbnail_url'], + 'description': info['description'], + 'uploader': info['author_name'], + } diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index b2a7b1df0..0404e6e43 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -2,11 +2,13 @@ import binascii import base64 import hashlib import re +import json from .common import InfoExtractor from ..utils import ( compat_ord, compat_urllib_parse, + compat_urllib_request, ExtractorError, ) @@ -16,7 +18,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): """Information Extractor for myvideo.de.""" - _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' + _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*' IE_NAME = u'myvideo' _TEST = { u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -85,6 +87,20 @@ class MyVideoIE(InfoExtractor): 'ext': video_ext, }] + mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) + if mobj is not None: + request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') + response = self._download_webpage(request, video_id, + u'Downloading video info') + info = json.loads(base64.b64decode(response).decode('utf-8')) + return {'id': video_id, + 'title': info['title'], + 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), + 'play_path': info['filename'], + 'ext': 'flv', + 'thumbnail': info['thumbnail'][0]['url'], + } + # try encxml mobj = re.search('var flashvars={(.+?)}', webpage) if mobj is None: diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py new file mode 100644 index 000000000..b734722d0 --- /dev/null +++ b/youtube_dl/extractor/ooyala.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unescapeHTML + +class OoyalaIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P.+?)(&|$)' + + _TEST = { + # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video + u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4', + u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c', + u'info_dict': { + u'title': u'Explaining Data Recovery from Hard Drives and SSDs', + u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + }, + } + + def _extract_result(self, info, more_info): + return {'id': info['embedCode'], + 'ext': 'mp4', + 'title': unescapeHTML(info['title']), + 'url': info['url'], + 'description': unescapeHTML(more_info['description']), + 'thumbnail': more_info['promo'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + embedCode = mobj.group('id') + player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode + player = self._download_webpage(player_url, embedCode) + mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', + player, u'mobile player url') + mobile_player = self._download_webpage(mobile_url, embedCode) + videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') + videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') + videos_info = json.loads(videos_info) + videos_more_info =json.loads(videos_more_info) + + if videos_more_info.get('lineup'): + videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] + return {'_type': 'playlist', + 'id': embedCode, + 'title': unescapeHTML(videos_more_info['title']), + 'entries': videos, + } + else: + return self._extract_result(videos_info[0], videos_more_info) + diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py new file mode 100644 index 000000000..65462d867 --- /dev/null +++ b/youtube_dl/extractor/pbs.py @@ -0,0 +1,34 @@ +import re +import json + +from .common import InfoExtractor + + +class PBSIE(InfoExtractor): + _VALID_URL = r'https?://video.pbs.org/video/(?P\d+)/?' + + _TEST = { + u'url': u'http://video.pbs.org/video/2365006249/', + u'file': u'2365006249.mp4', + u'md5': 'ce1888486f0908d555a8093cac9a7362', + u'info_dict': { + u'title': u'A More Perfect Union', + u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a', + u'duration': 3190, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id + info_page = self._download_webpage(info_url, video_id) + info =json.loads(info_page) + return {'id': video_id, + 'title': info['title'], + 'url': info['alternate_encoding']['url'], + 'ext': 'mp4', + 'description': info['program'].get('description'), + 'thumbnail': info.get('image_url'), + 'duration': info.get('duration'), + } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 000000000..d339e6cb5 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,49 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://www\.roxwel\.com/player/(?P.+?)(\.|\?|$)' + + _TEST = { + u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', + u'file': u'passionpittakeawalklive.flv', + u'md5': u'd9dea8360a1e7d485d2206db7fe13035', + u'info_dict': { + u'title': u'Take A Walk (live)', + u'uploader': u'Passion Pit', + u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + }, + u'skip': u'Requires rtmpdump', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info_page = self._download_webpage(info_url, filename, + u'Downloading video info') + + self.report_extraction(filename) + info = json.loads(info_page) + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return {'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py new file mode 100644 index 000000000..2f134e6a7 --- /dev/null +++ b/youtube_dl/extractor/rtlnow.py @@ -0,0 +1,113 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, +) + +class RTLnowIE(InfoExtractor): + """Information Extractor for RTLnow, RTL2now and VOXnow""" + _VALID_URL = r'(?:http://)?(?P(?Prtl(?:(?P2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _TESTS = [{ + u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', + u'file': u'90419.flv', + u'info_dict': { + u'upload_date': u'20070416', + u'title': u'Ahornallee - Folge 1 - Der Einzug', + u'description': u'Folge 1 - Der Einzug', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', + u'file': u'69756.flv', + u'info_dict': { + u'upload_date': u'20120519', + u'title': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', + u'description': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', + u'thumbnail': u'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', + u'file': u'13883.flv', + u'info_dict': { + u'upload_date': u'20090627', + u'title': u'Voxtours - Südafrika-Reporter II', + u'description': u'Südafrika-Reporter II', + }, + u'params': { + u'skip_download': True, + }, + }] + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + video_page_url = u'http://' + mobj.group('base_url') + video_id = mobj.group(u'video_id') + + webpage = self._download_webpage(webpage_url, video_id) + + note_m = re.search(r'''(?sx) + (.*?) + ''', webpage) + if note_m: + msg = clean_html(note_m.group(1)) + raise ExtractorError(msg) + + video_title = self._html_search_regex(r'(?P<title>[^<]+)', + webpage, u'title') + playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', + webpage, u'playerdata_url') + + playerdata = self._download_webpage(playerdata_url, video_id) + mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]>', playerdata) + if mobj: + video_description = mobj.group(u'description') + if mobj.group('upload_date_Y'): + video_upload_date = mobj.group('upload_date_Y') + else: + video_upload_date = u'20' + mobj.group('upload_date_y') + video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_description = None + video_upload_date = None + self._downloader.report_warning(u'Unable to extract description and upload date') + + # Thumbnail: not every video has an thumbnail + mobj = re.search(r'', webpage) + if mobj: + video_thumbnail = mobj.group(u'thumbnail') + else: + video_thumbnail = None + + mobj = re.search(r']+>rtmpe://(?:[^/]+/){2})(?P[^\]]+)\]\]>', playerdata) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + video_url = mobj.group(u'url') + video_play_path = u'mp4:' + mobj.group(u'play_path') + video_player_url = video_page_url + u'includes/vodplayer.swf' + + return [{ + 'id': video_id, + 'url': video_url, + 'play_path': video_play_path, + 'page_url': video_page_url, + 'player_url': video_player_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'upload_date': video_upload_date, + 'thumbnail': video_thumbnail, + }] diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py new file mode 100644 index 000000000..2cba53076 --- /dev/null +++ b/youtube_dl/extractor/slashdot.py @@ -0,0 +1,23 @@ +import re + +from .common import InfoExtractor + + +class SlashdotIE(InfoExtractor): + _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P.*?)(&|$)' + + _TEST = { + u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz', + u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4', + u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', + u'info_dict': { + u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + ooyala_url = self._search_regex(r'