From: Philipp Hagemeister Date: Tue, 15 Oct 2013 00:11:33 +0000 (+0200) Subject: Merge remote-tracking branch 'Rudloff/websurg' X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=4f41664de8e0486127d8dd16168d829c26f79fdf;hp=73b4fafd82256c66198b1670d1a6dccfaf5f782c Merge remote-tracking branch 'Rudloff/websurg' --- diff --git a/.gitignore b/.gitignore index 61cb6bc3c..7dd0ad09b 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ updates_key.pem *.flv *.mp4 *.part +test/testdata +.tox diff --git a/README.md b/README.md index 400e6cd48..8824daee2 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ which means you can modify it, redistribute it or use it however you like. -U, --update update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors continue on download errors + -i, --ignore-errors continue on download errors, for example to to + skip unavailable videos in a playlist --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent --referer REF specify a custom referer, use if the video access @@ -29,6 +30,11 @@ which means you can modify it, redistribute it or use it however you like. --extractor-descriptions Output descriptions of all supported extractors --proxy URL Use the specified HTTP/HTTPS proxy --no-check-certificate Suppress HTTPS certificate validation. + --cache-dir None Location in the filesystem where youtube-dl can + store downloaded information permanently. By + default $XDG_CACHE_HOME/youtube-dl or ~/.cache + /youtube-dl . + --no-cache-dir Disable filesystem caching ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) @@ -45,6 +51,10 @@ which means you can modify it, redistribute it or use it however you like. --date DATE download only videos uploaded in this date --datebefore DATE download only videos uploaded before this date --dateafter DATE download only videos uploaded after this date + --no-playlist download only the currently playing video + --age-limit YEARS download only videos suitable for the given age + --download-archive FILE Download only videos not present in the archive + file. Record all downloaded videos in it. ## Download Options: -r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index 33f242480..153e15c8a 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -16,10 +16,11 @@ def main(): ie_htmls = [] for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): ie_html = '{}'.format(ie.IE_NAME) - try: + ie_desc = getattr(ie, 'IE_DESC', None) + if ie_desc is False: + continue + elif ie_desc is not None: ie_html += ': {}'.format(ie.IE_DESC) - except AttributeError: - pass if ie.working() == False: ie_html += ' (Currently broken)' ie_htmls.append('
  • {}
  • '.format(ie_html)) diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py deleted file mode 100644 index 66019ee55..000000000 --- a/devscripts/youtube_genalgo.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python - -# Generate youtube signature algorithm from test cases - -import sys - -tests = [ - # 92 - vflQw-fB4 2013/07/17 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", - "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), - # 90 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", - "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), - # 89 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", - "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), - # 88 - vflapUV9V 2013/08/28 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", - "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), - # 87 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), - # 86 - vfluy6kdb 2013/09/06 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "yuioplkjhgfdsazxcvbnm12345678q0QWrRTYUIOELKJHGFD-AZXCVBNM!@#$%^&*()_<+={[|};?/>.S"), - # 85 - vflkuzxcs 2013/09/11 - ('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[', - '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@'), - # 84 - vflg0g8PQ 2013/08/29 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - ">?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWq0987654321mnbvcxzasdfghjklpoiuytr"), - # 83 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), - # 82 - vflGNjMhJ 2013/09/12 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", - ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"), - # 81 - vflLC8JvQ 2013/07/25 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", - "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), - # 80 - vflZK4ZYR 2013/08/23 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>", - "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>"), - # 79 - vflLC8JvQ 2013/07/25 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", - "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), -] - -tests_age_gate = [ - # 86 - vflqinMWD - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), -] - -def find_matching(wrong, right): - idxs = [wrong.index(c) for c in right] - return compress(idxs) - return ('s[%d]' % i for i in idxs) - -def compress(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = ':%d' % (end+step) - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - -def _assert_compress(inp, exp): - res = list(compress(inp)) - if res != exp: - print('Got %r, expected %r' % (res, exp)) - assert res == exp -_assert_compress([0,2,4,6], ['s[0]', 's[2]', 's[4]', 's[6]']) -_assert_compress([0,1,2,4,6,7], ['s[:3]', 's[4]', 's[6:8]']) -_assert_compress([8,0,1,2,4,7,6,9], ['s[8]', 's[:3]', 's[4]', 's[7:5:-1]', 's[9]']) - -def gen(wrong, right, indent): - code = ' + '.join(find_matching(wrong, right)) - return 'if len(s) == %d:\n%s return %s\n' % (len(wrong), indent, code) - -def genall(tests): - indent = ' ' * 8 - return indent + (indent + 'el').join(gen(wrong, right, indent) for wrong,right in tests) - -def main(): - print(genall(tests)) - print(u' Age gate:') - print(genall(tests_age_gate)) - -if __name__ == '__main__': - main() diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/helper.py b/test/helper.py index a2b468b50..79a0ede48 100644 --- a/test/helper.py +++ b/test/helper.py @@ -1,38 +1,63 @@ +import errno import io +import hashlib import json import os.path +import re +import types import youtube_dl.extractor -from youtube_dl import YoutubeDL, YoutubeDLHandler -from youtube_dl.utils import ( - compat_cookiejar, - compat_urllib_request, -) - -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) - -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - parameters = json.load(pf) +from youtube_dl import YoutubeDL + + +def global_setup(): + youtube_dl._setup_opener(timeout=10) + + +def get_params(override=None): + PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "parameters.json") + with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: + parameters = json.load(pf) + if override: + parameters.update(override) + return parameters + + +def try_rm(filename): + """ Remove a file if it exists """ + try: + os.remove(filename) + except OSError as ose: + if ose.errno != errno.ENOENT: + raise + class FakeYDL(YoutubeDL): def __init__(self): - self.result = [] # Different instances of the downloader can't share the same dictionary # some test set the "sublang" parameter, which would break the md5 checks. - self.params = dict(parameters) - def to_screen(self, s): + params = get_params() + super(FakeYDL, self).__init__(params) + self.result = [] + + def to_screen(self, s, skip_eol=None): print(s) + def trouble(self, s, tb=None): raise Exception(s) + def download(self, x): self.result.append(x) + def expect_warning(self, regex): + # Silence an expected warning matching a regex + old_report_warning = self.report_warning + def report_warning(self, message): + if re.match(regex, message): return + old_report_warning(message) + self.report_warning = types.MethodType(report_warning, self) + def get_testcases(): for ie in youtube_dl.extractor.gen_extractors(): t = getattr(ie, '_TEST', None) @@ -42,3 +67,6 @@ def get_testcases(): for t in getattr(ie, '_TESTS', []): t['name'] = type(ie).__name__[:-len('IE')] yield t + + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py new file mode 100644 index 000000000..d500c6edc --- /dev/null +++ b/test/test_age_restriction.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup, try_rm +global_setup() + + +from youtube_dl import YoutubeDL + + +def _download_restricted(url, filename, age): + """ Returns true iff the file has been downloaded """ + + params = { + 'age_limit': age, + 'skip_download': True, + 'writeinfojson': True, + "outtmpl": "%(id)s.%(ext)s", + } + ydl = YoutubeDL(params) + ydl.add_default_info_extractors() + json_filename = filename + '.info.json' + try_rm(json_filename) + ydl.download([url]) + res = os.path.exists(json_filename) + try_rm(json_filename) + return res + + +class TestAgeRestriction(unittest.TestCase): + def _assert_restricted(self, url, filename, age, old_age=None): + self.assertTrue(_download_restricted(url, filename, old_age)) + self.assertFalse(_download_restricted(url, filename, age)) + + def test_youtube(self): + self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + + def test_youporn(self): + self._assert_restricted( + 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + '505835.mp4', 2, old_age=25) + + def test_pornotube(self): + self._assert_restricted( + 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', + '1689755.flv', 13) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index ff1c86efe..56e5f80e1 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -1,14 +1,20 @@ #!/usr/bin/env python +# Allow direct execution +import os import sys import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors -from helper import get_testcases +from test.helper import get_testcases + +from youtube_dl.extractor import ( + gen_extractors, + JustinTVIE, + YoutubeIE, +) + class TestAllURLsMatching(unittest.TestCase): def setUp(self): diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index 83c65d57e..c596415c4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -1,20 +1,16 @@ #!/usr/bin/env python +# Allow direct execution +import os import sys import unittest -import json -import io -import hashlib +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import FakeYDL, global_setup, md5 +global_setup() -from youtube_dl.extractor import DailymotionIE -from youtube_dl.utils import * -from helper import FakeYDL -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() +from youtube_dl.extractor import DailymotionIE class TestDailymotionSubtitles(unittest.TestCase): def setUp(self): @@ -45,15 +41,18 @@ class TestDailymotionSubtitles(unittest.TestCase): subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) def test_list_subtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_automatic_captions(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslang'] = ['en'] subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) == 0) def test_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True diff --git a/test/test_download.py b/test/test_download.py index 23a66254d..b9a9be11d 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -1,43 +1,31 @@ #!/usr/bin/env python -import errno +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_params, get_testcases, global_setup, try_rm, md5 +global_setup() + + import hashlib import io -import os import json -import unittest -import sys import socket -import binascii - -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import youtube_dl.YoutubeDL -from youtube_dl.utils import * - -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") +from youtube_dl.utils import ( + compat_str, + compat_urllib_error, + DownloadError, + ExtractorError, + UnavailableVideoError, +) RETRIES = 3 -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) -socket.setdefaulttimeout(10) - -def _try_rm(filename): - """ Remove a file if it exists """ - try: - os.remove(filename) - except OSError as ose: - if ose.errno != errno.ENOENT: - raise - -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() - class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen @@ -54,17 +42,12 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() -from helper import get_testcases defs = get_testcases() -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - parameters = json.load(pf) - class TestDownload(unittest.TestCase): maxDiff = None def setUp(self): - self.parameters = parameters self.defs = defs ### Dynamically generate tests @@ -84,8 +67,7 @@ def generator(test_case): print_skipping(test_case['skip']) return - params = self.parameters.copy() - params.update(test_case.get('params', {})) + params = get_params(test_case.get('params', {})) ydl = YoutubeDL(params) ydl.add_default_info_extractors() @@ -97,9 +79,9 @@ def generator(test_case): test_cases = test_case.get('playlist', [test_case]) for tc in test_cases: - _try_rm(tc['file']) - _try_rm(tc['file'] + '.part') - _try_rm(tc['file'] + '.info.json') + try_rm(tc['file']) + try_rm(tc['file'] + '.part') + try_rm(tc['file'] + '.info.json') try: for retry in range(1, RETRIES + 1): try: @@ -145,9 +127,9 @@ def generator(test_case): self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: for tc in test_cases: - _try_rm(tc['file']) - _try_rm(tc['file'] + '.part') - _try_rm(tc['file'] + '.info.json') + try_rm(tc['file']) + try_rm(tc['file'] + '.part') + try_rm(tc['file'] + '.info.json') return test_template diff --git a/test/test_playlists.py b/test/test_playlists.py index d079a4f23..d6a8d56df 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -1,17 +1,27 @@ #!/usr/bin/env python +# encoding: utf-8 -import sys -import unittest -import json # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE -from youtube_dl.utils import * +from test.helper import FakeYDL, global_setup +global_setup() + + +from youtube_dl.extractor import ( + DailymotionPlaylistIE, + DailymotionUserIE, + VimeoChannelIE, + UstreamChannelIE, + SoundcloudUserIE, + LivestreamIE, + NHLVideocenterIE, +) -from helper import FakeYDL class TestPlaylists(unittest.TestCase): def assertIsPlaylist(self, info): @@ -26,6 +36,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'SPORT') self.assertTrue(len(result['entries']) > 20) + def test_dailymotion_user(self): + dl = FakeYDL() + ie = DailymotionUserIE(dl) + result = ie.extract('http://www.dailymotion.com/user/generation-quoi/') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Génération Quoi') + self.assertTrue(len(result['entries']) >= 26) + def test_vimeo_channel(self): dl = FakeYDL() ie = VimeoChannelIE(dl) @@ -50,5 +68,22 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], u'9615865') self.assertTrue(len(result['entries']) >= 12) + def test_livestream_event(self): + dl = FakeYDL() + ie = LivestreamIE(dl) + result = ie.extract('http://new.livestream.com/tedx/cityenglish') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'TEDCity2.0 (English)') + self.assertTrue(len(result['entries']) >= 4) + + def test_nhl_videocenter(self): + dl = FakeYDL() + ie = NHLVideocenterIE(dl) + result = ie.extract('http://video.canucks.nhl.com/videocenter/console?catid=999') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'999') + self.assertEqual(result['title'], u'Highlights') + self.assertEqual(len(result['entries']), 12) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index ff2e9885b..270669044 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -# Various small unit tests - +# Allow direct execution +import os import sys import unittest -import xml.etree.ElementTree +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Various small unit tests +import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform from youtube_dl.utils import ( @@ -20,6 +20,7 @@ from youtube_dl.utils import ( unified_strdate, find_xpath_attr, get_meta_content, + xpath_with_ns, ) if sys.version_info < (3, 0): @@ -141,5 +142,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(get_meta('description'), u'foo & bar') self.assertEqual(get_meta('author'), 'Plato') + def test_xpath_with_ns(self): + testxml = u''' + + The Author + http://server.com/download.mp3 + + ''' + doc = xml.etree.ElementTree.fromstring(testxml) + find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) + self.assertTrue(find('media:song') is not None) + self.assertEqual(find('media:song/media:author').text, u'The Author') + self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3') + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py new file mode 100644 index 000000000..6f08808cd --- /dev/null +++ b/test/test_write_annotations.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_params, global_setup, try_rm +global_setup() + + +import io + +import xml.etree.ElementTree + +import youtube_dl.YoutubeDL +import youtube_dl.extractor +from youtube_dl.utils import True + + +class YoutubeDL(youtube_dl.YoutubeDL): + def __init__(self, *args, **kwargs): + super(YoutubeDL, self).__init__(*args, **kwargs) + self.to_stderr = self.to_screen + +params = get_params({ + 'writeannotations': True, + 'skip_download': True, + 'writeinfojson': False, + 'format': 'flv', +}) + + + +TEST_ID = 'gr51aVj-mLg' +ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' +EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] + +class TestAnnotations(unittest.TestCase): + def setUp(self): + # Clear old files + self.tearDown() + + + def test_info_json(self): + expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text. + ie = youtube_dl.extractor.YoutubeIE() + ydl = YoutubeDL(params) + ydl.add_info_extractor(ie) + ydl.download([TEST_ID]) + self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) + annoxml = None + with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: + annoxml = xml.etree.ElementTree.parse(annof) + self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') + root = annoxml.getroot() + self.assertEqual(root.tag, 'document') + annotationsTag = root.find('annotations') + self.assertEqual(annotationsTag.tag, 'annotations') + annotations = annotationsTag.findall('annotation') + + #Not all the annotations have TEXT children and the annotations are returned unsorted. + for a in annotations: + self.assertEqual(a.tag, 'annotation') + if a.get('type') == 'text': + textTag = a.find('TEXT') + text = textTag.text + self.assertTrue(text in expected) #assertIn only added in python 2.7 + #remove the first occurance, there could be more than one annotation with the same text + expected.remove(text) + #We should have seen (and removed) all the expected annotation texts. + self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') + + + def tearDown(self): + try_rm(ANNOTATIONS_FILE) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index de6d5180f..a5b6f6972 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -1,37 +1,34 @@ #!/usr/bin/env python # coding: utf-8 -import json +# Allow direct execution import os import sys import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import get_params, global_setup +global_setup() + + +import io +import json import youtube_dl.YoutubeDL import youtube_dl.extractor -from youtube_dl.utils import * - -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = self.to_screen -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - params = json.load(pf) -params['writeinfojson'] = True -params['skip_download'] = True -params['writedescription'] = True +params = get_params({ + 'writeinfojson': True, + 'skip_download': True, + 'writedescription': True, +}) + TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.mp4.info.json' @@ -42,6 +39,7 @@ This is a test video for youtube-dl. For more information, contact phihag@phihag.de .''' + class TestInfoJSON(unittest.TestCase): def setUp(self): # Clear old files diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index dd9e292b0..c1753b5bb 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,20 +1,26 @@ #!/usr/bin/env python +# Allow direct execution +import os import sys import unittest -import json +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import FakeYDL, global_setup +global_setup() -from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE -from youtube_dl.utils import * -from helper import FakeYDL +from youtube_dl.extractor import ( + YoutubeUserIE, + YoutubePlaylistIE, + YoutubeIE, + YoutubeChannelIE, + YoutubeShowIE, +) + class TestYoutubeLists(unittest.TestCase): - def assertIsPlaylist(self,info): + def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" self.assertEqual(info['_type'], 'playlist') @@ -27,6 +33,14 @@ class TestYoutubeLists(unittest.TestCase): ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE']) + def test_youtube_playlist_noplaylist(self): + dl = FakeYDL() + dl.params['noplaylist'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertEqual(result['_type'], 'url') + self.assertEqual(YoutubeIE()._extract_id(result['url']), 'FXxLjLQi3Fg') + def test_issue_673(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py new file mode 100644 index 000000000..5e1ff5eb0 --- /dev/null +++ b/test/test_youtube_signature.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup +global_setup() + + +import io +import re +import string + +from youtube_dl.extractor import YoutubeIE +from youtube_dl.utils import compat_str, compat_urlretrieve + +_TESTS = [ + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', + u'js', + 86, + u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', + ), + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', + u'js', + 85, + u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', + ), + ( + u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf', + u'swf', + 82, + u':/.-,+*)=\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBAzyxw>utsrqponmlkjihgfedcba987654321' + ), +] + + +class TestSignature(unittest.TestCase): + def setUp(self): + TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + if not os.path.exists(self.TESTDATA_DIR): + os.mkdir(self.TESTDATA_DIR) + + +def make_tfunc(url, stype, sig_length, expected_sig): + basename = url.rpartition('/')[2] + m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) + assert m, '%r should follow URL format' % basename + test_id = m.group(1) + + def test_func(self): + fn = os.path.join(self.TESTDATA_DIR, basename) + + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + + ie = YoutubeIE() + if stype == 'js': + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) + else: + assert stype == 'swf' + with open(fn, 'rb') as testf: + swfcode = testf.read() + func = ie._parse_sig_swf(swfcode) + src_sig = compat_str(string.printable[:sig_length]) + got_sig = func(src_sig) + self.assertEqual(got_sig, expected_sig) + + test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + setattr(TestSignature, test_func.__name__, test_func) + +for test_spec in _TESTS: + make_tfunc(*test_spec) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 168e6c66c..00430a338 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -1,76 +1,87 @@ #!/usr/bin/env python +# Allow direct execution +import os import sys import unittest -import json -import io -import hashlib +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeIE -from youtube_dl.utils import * -from helper import FakeYDL -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() self.url = 'QRS8MkLhQmM' + def getInfoDict(self): IE = YoutubeIE(self.DL) info_dict = IE.extract(self.url) return info_dict + def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict[0]['subtitles'] + def test_youtube_no_writesubtitles(self): self.DL.params['writesubtitles'] = False subtitles = self.getSubtitles() self.assertEqual(subtitles, None) + def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'sbv' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') + def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + def test_youtube_list_subtitles(self): + self.DL.expect_warning(u'Video doesn\'t have automatic captions') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) + def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') self.url = 'sAjKT8FhjI8' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) + def test_youtube_multiple_langs(self): self.url = 'QRS8MkLhQmM' self.DL.params['writesubtitles'] = True diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000..ed01e3386 --- /dev/null +++ b/tox.ini @@ -0,0 +1,8 @@ +[tox] +envlist = py26,py27,py33 +[testenv] +deps = + nose + coverage +commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html + # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 0b5a5d77d..8ecabab1a 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -77,26 +77,43 @@ class FileDownloader(object): @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: + return None + return float(byte_counter) / float(data_len) * 100.0 + + @staticmethod + def format_percent(percent): + if percent is None: return '---.-%' - return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) + return '%6s' % ('%3.1f%%' % percent) @staticmethod def calc_eta(start, now, total, current): if total is None: - return '--:--' + return None dif = now - start if current == 0 or dif < 0.001: # One millisecond - return '--:--' + return None rate = float(current) / dif - eta = int((float(total) - float(current)) / rate) + return int((float(total) - float(current)) / rate) + + @staticmethod + def format_eta(eta): + if eta is None: + return '--:--' return FileDownloader.format_seconds(eta) @staticmethod def calc_speed(start, now, bytes): dif = now - start if bytes == 0 or dif < 0.001: # One millisecond + return None + return float(bytes) / dif + + @staticmethod + def format_speed(speed): + if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) + return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -205,11 +222,14 @@ class FileDownloader(object): """Report destination filename.""" self.to_screen(u'[download] Destination: ' + filename) - def report_progress(self, percent_str, data_len_str, speed_str, eta_str): + def report_progress(self, percent, data_len_str, speed, eta): """Report download progress.""" if self.params.get('noprogress', False): return clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + eta_str = self.format_eta(eta) + percent_str = self.format_percent(percent) + speed_str = self.format_speed(speed) if self.params.get('progress_with_newline', False): self.to_screen(u'[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str)) @@ -250,6 +270,7 @@ class FileDownloader(object): def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) tmpfilename = self.temp_name(filename) + test = self.params.get('test', False) # Check for rtmpdump first try: @@ -271,6 +292,8 @@ class FileDownloader(object): basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] + if test: + basic_args += ['--stop', '1'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: @@ -280,7 +303,7 @@ class FileDownloader(object): shell_quote = repr self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) retval = subprocess.call(args) - while retval == 2 or retval == 1: + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) time.sleep(5.0) # This seems to be needed @@ -293,7 +316,7 @@ class FileDownloader(object): self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') retval = 0 break - if retval == 0: + if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) @@ -378,6 +401,7 @@ class FileDownloader(object): self._hook_progress({ 'filename': filename, 'status': 'finished', + 'total_bytes': os.path.getsize(encodeFilename(filename)), }) return True @@ -524,13 +548,14 @@ class FileDownloader(object): block_size = self.best_block_size(after - before, len(data_block)) # Progress message - speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) + speed = self.calc_speed(start, time.time(), byte_counter - resume_len) if data_len is None: self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + eta = None else: - percent_str = self.calc_percent(byte_counter, data_len) - eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - self.report_progress(percent_str, data_len_str, speed_str, eta_str) + percent = self.calc_percent(byte_counter, data_len) + eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent, data_len_str, speed, eta) self._hook_progress({ 'downloaded_bytes': byte_counter, @@ -538,6 +563,8 @@ class FileDownloader(object): 'tmpfilename': tmpfilename, 'filename': filename, 'status': 'downloading', + 'eta': eta, + 'speed': speed, }) # Apply rate limit @@ -580,6 +607,8 @@ class FileDownloader(object): * downloaded_bytes: Bytes on disks * total_bytes: Total bytes, None if unknown * tmpfilename: The filename we're currently writing to + * eta: The estimated time in seconds, None if unknown + * speed: The download speed in bytes/second, None if unknown Hooks are guaranteed to be called at least once (with status "finished") if the download is successful. diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index ae56d2082..13b56ede5 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -3,7 +3,14 @@ import subprocess import sys import time -from .utils import * + +from .utils import ( + compat_subprocess_get_DEVNULL, + encodeFilename, + PostProcessingError, + shell_quote, + subtitles_filename, +) class PostProcessor(object): @@ -82,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor): + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: @@ -177,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: - if int(self._preferredquality) < 10: + # The opus codec doesn't support the -aq option + if int(self._preferredquality) < 10 and extension != 'opus': more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality] else: more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k'] @@ -444,8 +454,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): if information['ext'] != u'mp4': self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - sub_langs = [key for key in information['subtitles']] + if not information.get('subtitles'): + self._downloader.to_screen(u'[ffmpeg] There aren\'t any subtitles to embed') + return True, information + sub_langs = [key for key in information['subtitles']] filename = information['filepath'] input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] @@ -464,3 +477,35 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return True, information + + +class FFmpegMetadataPP(FFmpegPostProcessor): + def run(self, info): + metadata = {} + if info.get('title') is not None: + metadata['title'] = info['title'] + if info.get('upload_date') is not None: + metadata['date'] = info['upload_date'] + if info.get('uploader') is not None: + metadata['artist'] = info['uploader'] + elif info.get('uploader_id') is not None: + metadata['artist'] = info['uploader_id'] + + if not metadata: + self._downloader.to_screen(u'[ffmpeg] There isn\'t any metadata to add') + return True, info + + filename = info['filepath'] + ext = os.path.splitext(filename)[1][1:] + temp_filename = filename + u'.temp' + + options = ['-c', 'copy'] + for (name, value) in metadata.items(): + options.extend(['-metadata', '%s="%s"' % (name, value)]) + options.extend(['-f', ext]) + + self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) + self.run_ffmpeg(filename, temp_filename, options) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + return True, info diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index de2b133e0..c8054544a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -3,6 +3,7 @@ from __future__ import absolute_import +import errno import io import os import re @@ -70,6 +71,7 @@ class YoutubeDL(object): logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file @@ -81,6 +83,14 @@ class YoutubeDL(object): keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + None to disable filesystem cache. + noplaylist: Download single video instead of a playlist if in doubt. + age_limit: An integer representing the user's age in years. + Unsuitable videos for the given age are skipped. + downloadarchive: File name of a file where all downloads are recorded. + Videos already present in the file are not downloaded + again. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -104,6 +114,17 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + + if (sys.version_info >= (3,) and sys.platform != 'win32' and + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not params['restrictfilenames']): + # On Python 3, the Unicode filesystem API will throw errors (#1474) + self.report_warning( + u'Assuming --restrict-filenames since file system encoding ' + u'cannot encode all charactes. ' + u'Set the LC_ALL environment variable to fix this.') + params['restrictfilenames'] = True + self.params = params self.fd = FileDownloader(self, self.params) @@ -238,6 +259,10 @@ class YoutubeDL(object): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) + def report_writeannotations(self, annofn): + """ Report that the annotations file has been written. """ + self.to_screen(u'[info] Writing video annotations to: ' + annofn) + def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -295,6 +320,13 @@ class YoutubeDL(object): dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + age_limit = self.params.get('age_limit') + if age_limit is not None: + if age_limit < info_dict.get('age_limit', 0): + return u'Skipping "' + title + '" because it is age restricted' + if self.in_download_archive(info_dict): + return (u'%(title)s has already been recorded in archive' + % info_dict) return None def extract_info(self, url, download=True, ie_key=None, extra_info={}): @@ -495,6 +527,18 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return + if self.params.get('writeannotations', False): + try: + annofn = filename + u'.annotations.xml' + self.report_writeannotations(annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return + subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -532,11 +576,15 @@ class YoutubeDL(object): thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format self.to_screen(u'[%s] %s: Downloading thumbnail ...' % (info_dict['extractor'], info_dict['id'])) - uf = compat_urllib_request.urlopen(info_dict['thumbnail']) - with open(thumb_filename, 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % - (info_dict['extractor'], info_dict['id'], thumb_filename)) + try: + uf = compat_urllib_request.urlopen(info_dict['thumbnail']) + with open(thumb_filename, 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % + (info_dict['extractor'], info_dict['id'], thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning(u'Unable to download thumbnail "%s": %s' % + (info_dict['thumbnail'], compat_str(err))) if not self.params.get('skip_download', False): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): @@ -544,11 +592,11 @@ class YoutubeDL(object): else: try: success = self.fd._do_download(filename, info_dict) - except (OSError, IOError) as err: - raise UnavailableVideoError(err) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_error(u'unable to download video data: %s' % str(err)) return + except (OSError, IOError) as err: + raise UnavailableVideoError(err) except (ContentTooShortError, ) as err: self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return @@ -560,6 +608,8 @@ class YoutubeDL(object): self.report_error(u'postprocessing: %s' % str(err)) return + self.record_download_archive(info_dict) + def download(self, url_list): """Download a given list of URLs.""" if len(url_list) > 1 and self.fixed_template(): @@ -599,3 +649,26 @@ class YoutubeDL(object): os.remove(encodeFilename(filename)) except (IOError, OSError): self.report_warning(u'Unable to remove downloaded video file') + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + if line.strip() == vid_id: + return True + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + u'\n') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9efd7c3f7..5248a92c7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -30,11 +30,14 @@ __authors__ = ( 'Pierre Rudloff', 'Huarong Huo', 'Ismael Mejía', + 'Steffan \'Ruirize\' James', + 'Andras Elso', ) __license__ = 'Public Domain' import codecs +import collections import getpass import optparse import os @@ -44,17 +47,43 @@ import shlex import socket import subprocess import sys -import warnings +import traceback import platform -from .utils import * +from .utils import ( + compat_cookiejar, + compat_print, + compat_str, + compat_urllib_request, + DateRange, + decodeOption, + determine_ext, + DownloadError, + get_cachedir, + make_HTTPS_handler, + MaxDownloadsReached, + platform_name, + preferredencoding, + SameFileError, + std_headers, + write_string, + YoutubeDLHandler, +) from .update import update_self from .version import __version__ -from .FileDownloader import * +from .FileDownloader import ( + FileDownloader, +) from .extractor import gen_extractors from .YoutubeDL import YoutubeDL -from .PostProcessor import * +from .PostProcessor import ( + FFmpegMetadataPP, + FFmpegVideoConvertor, + FFmpegExtractAudioPP, + FFmpegEmbedSubtitlePP, +) + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes): @@ -149,7 +178,7 @@ def parseOpts(overrideArguments=None): general.add_option('-U', '--update', action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)') general.add_option('-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) + action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False) general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) @@ -166,6 +195,12 @@ def parseOpts(overrideArguments=None): help='Output descriptions of all supported extractors', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') + general.add_option( + '--cache-dir', dest='cachedir', default=get_cachedir(), + help='Location in the filesystem where youtube-dl can store downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl .') + general.add_option( + '--no-cache-dir', action='store_const', const=None, dest='cachedir', + help='Disable filesystem caching') selection.add_option('--playlist-start', @@ -180,6 +215,13 @@ def parseOpts(overrideArguments=None): selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) + selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False) + selection.add_option('--age-limit', metavar='YEARS', dest='age_limit', + help='download only videos suitable for the given age', + default=None, type=int) + selection.add_option('--download-archive', metavar='FILE', + dest='download_archive', + help='Download only videos not present in the archive file. Record all downloaded videos in it.') authentication.add_option('-u', '--username', @@ -271,6 +313,10 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--youtube-print-sig-code', + action='store_true', dest='youtube_print_sig_code', default=False, + help=optparse.SUPPRESS_HELP) + filesystem.add_option('-t', '--title', action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -320,6 +366,9 @@ def parseOpts(overrideArguments=None): filesystem.add_option('--write-info-json', action='store_true', dest='writeinfojson', help='write video metadata to a .info.json file', default=False) + filesystem.add_option('--write-annotations', + action='store_true', dest='writeannotations', + help='write video annotations to a .annotation file', default=False) filesystem.add_option('--write-thumbnail', action='store_true', dest='writethumbnail', help='write thumbnail image to disk', default=False) @@ -339,6 +388,8 @@ def parseOpts(overrideArguments=None): help='do not overwrite post-processed files; the post-processed files are overwritten by default') postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, help='embed subtitles in the video (only for mp4 videos)') + postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False, + help='add metadata to the files') parser.add_option_group(general) @@ -358,9 +409,13 @@ def parseOpts(overrideArguments=None): else: xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: - userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') + userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') else: - userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) commandLineConf = sys.argv[1:] @@ -425,27 +480,7 @@ def _real_main(argv=None): all_urls = batchurls + args all_urls = [url.strip() for url in all_urls] - # General configuration - cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy is not None: - if opts.proxy == '': - proxies = {} - else: - proxies = {'http': opts.proxy, 'https': opts.proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) - https_handler = make_HTTPS_handler(opts) - opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders =[] - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) + opener = _setup_opener(jar=jar, opts=opts) extractors = gen_extractors() @@ -462,6 +497,8 @@ def _real_main(argv=None): if not ie._WORKING: continue desc = getattr(ie, 'IE_DESC', ie.IE_NAME) + if desc is False: + continue if hasattr(ie, 'SEARCH_KEY'): _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise') _COUNTS = (u'', u'5', u'10', u'all') @@ -550,6 +587,10 @@ def _real_main(argv=None): or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') or u'%(title)s-%(id)s.%(ext)s') + if '%(ext)s' not in outtmpl and opts.extractaudio: + parser.error(u'Cannot download a video and extract audio into the same' + u' file! Use "%%(ext)s" instead of %r' % + determine_ext(outtmpl, u'')) # YoutubeDL ydl = YoutubeDL({ @@ -584,11 +625,13 @@ def _real_main(argv=None): 'progress_with_newline': opts.progress_with_newline, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, + 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, 'nopart': opts.nopart, 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'writesubtitles': opts.writesubtitles, @@ -608,6 +651,10 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': opts.download_archive, }) if opts.verbose: @@ -627,11 +674,19 @@ def _real_main(argv=None): except: pass write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') - write_string(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') + + proxy_map = {} + for handler in opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') ydl.add_default_info_extractors() # PostProcessors + # Add the metadata pp first, the other pps will copy it + if opts.addmetadata: + ydl.add_post_processor(FFmpegMetadataPP()) if opts.extractaudio: ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) if opts.recodevideo: @@ -641,7 +696,7 @@ def _real_main(argv=None): # Update version if opts.update_self: - update_self(ydl.to_screen, opts.verbose, sys.argv[0]) + update_self(ydl.to_screen, opts.verbose) # Maybe do nothing if len(all_urls) < 1: @@ -660,11 +715,42 @@ def _real_main(argv=None): if opts.cookiefile is not None: try: jar.save() - except (IOError, OSError) as err: + except (IOError, OSError): sys.exit(u'ERROR: unable to save cookie jar') sys.exit(retcode) + +def _setup_opener(jar=None, opts=None, timeout=300): + if opts is None: + FakeOptions = collections.namedtuple( + 'FakeOptions', ['proxy', 'no_check_certificate']) + opts = FakeOptions(proxy=None, no_check_certificate=False) + + cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) + if opts.proxy is not None: + if opts.proxy == '': + proxies = {} + else: + proxies = {'http': opts.proxy, 'https': opts.proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = compat_urllib_request.ProxyHandler(proxies) + https_handler = make_HTTPS_handler(opts) + opener = compat_urllib_request.build_opener( + https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders = [] + compat_urllib_request.install_opener(opener) + socket.setdefaulttimeout(timeout) + return opener + + def main(argv=None): try: _real_main(argv) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19ded18f1..5f0e2ec9b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE -from .arte import ArteTvIE +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, + ArteTVFutureIE, +) from .auengine import AUEngineIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE @@ -12,27 +17,40 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cinemassacre import CinemassacreIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE, DailymotionPlaylistIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) from .daum import DaumIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE +from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE from .facebook import FacebookIE +from .faz import FazIE +from .fktv import ( + FKTVIE, + FKTVPosteckeIE, +) from .flickr import FlickrIE from .francetv import ( PluzzIE, FranceTvInfoIE, + France2IE, + GenerationQuoiIE ) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE @@ -49,6 +67,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE @@ -68,6 +87,9 @@ from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE +from .newgrounds import NewgroundsIE +from .nhl import NHLIE, NHLVideocenterIE +from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE @@ -77,6 +99,7 @@ from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE @@ -89,6 +112,7 @@ from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE +from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .ted import TEDIE from .tf1 import TF1IE @@ -105,7 +129,10 @@ from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE from .vice import ViceIE +from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .wat import WatIE @@ -129,6 +156,7 @@ from .youtube import ( YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeRecommendedIE, + YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, ) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c196..6d6237f8a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,8 +1,10 @@ import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( + compat_urlparse, determine_ext, ) @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor): u"playlist": [ { u"file": u"manofsteel-trailer4.mov", - u"md5": u"11874af099d480cc09e103b189805d5f", + u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8", u"info_dict": { u"duration": 111, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", u"title": u"Trailer 4", u"upload_date": u"20130523", u"uploader_id": u"wb", @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer3.mov", - u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"md5": u"b8017b7131b721fb4e8d6f49e1df908c", u"info_dict": { u"duration": 182, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", u"title": u"Trailer 3", u"upload_date": u"20130417", u"uploader_id": u"wb", @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer.mov", - u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"md5": u"d0f1e1150989b9924679b441f3404d48", u"info_dict": { u"duration": 148, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", u"title": u"Trailer", u"upload_date": u"20121212", u"uploader_id": u"wb", @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-teaser.mov", - u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"md5": u"5fe08795b943eb2e757fa95cb6def1cb", u"info_dict": { u"duration": 93, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", u"title": u"Teaser", u"upload_date": u"20120721", u"uploader_id": u"wb", @@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor): ] } + _JSON_RE = r'iTunes.playURL\((.*?)\);' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) + playlist_cleaned = re.sub(r'(?s).*?', u'', playlist_snippet) + playlist_cleaned = re.sub(r'', r'', playlist_cleaned) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # with xml.etree.ElementTree.fromstring + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) playlist_html = u'' + playlist_cleaned + u'' - size_cache = {} - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): - title = li.find('.//h3').text + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, u'trailer info') + trailer_info = json.loads(trailer_info_json) + title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') - date_el = li.find('.//p') - upload_date = None - m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) - if m: - upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') - runtime_el = date_el.find('./br') - m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', runtime_el.tail) + runtime = trailer_info['runtime'] + m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) duration = None if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - formats = [] - for formats_el in li.findall('.//a'): - if formats_el.attrib['class'] != 'OverlayPanel': - continue - target = formats_el.attrib['target'] - - format_code = formats_el.text - if 'Automatic' in format_code: - continue + first_url = trailer_info['url'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') + settings = json.loads(settings_json) - size_q = formats_el.attrib['href'] - size_id = size_q.rpartition('#videos-')[2] - if size_id not in size_cache: - size_url = url + size_q - sizepage_html = self._download_webpage( - size_url, movie, - note=u'Downloading size info %s' % size_id, - errnote=u'Error while downloading size info %s' % size_id, - ) - _doc = xml.etree.ElementTree.fromstring(sizepage_html) - size_cache[size_id] = _doc - - sizepage_doc = size_cache[size_id] - links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') - for vid_a in links: - href = vid_a.get('href') - if not href.endswith(target): - continue - detail_q = href.partition('#')[0] - detail_url = url + '/' + detail_q - - m = re.match(r'includes/(?P[^/]+)/', detail_q) - detail_id = m.group('detail_id') - - detail_html = self._download_webpage( - detail_url, movie, - note=u'Downloading detail %s %s' % (detail_id, size_id), - errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) - ) - detail_doc = xml.etree.ElementTree.fromstring(detail_html) - movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') - assert movie_link_el.get('class') == 'movieLink' - movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') - ext = determine_ext(movie_link) - assert ext == 'mov' - - formats.append({ - 'format': format_code, - 'ext': ext, - 'url': movie_link, - }) + formats = [] + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'ext': determine_ext(format_url), + 'format': format['type'], + 'width': format['width'], + 'height': int(format['height']), + }) + formats = sorted(formats, key=lambda f: (f['height'], f['width'])) info = { '_type': 'video', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 69b3b0ad7..5ee8a67b1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import json import xml.etree.ElementTree @@ -7,15 +8,15 @@ from ..utils import ( ExtractorError, find_xpath_attr, unified_strdate, + determine_ext, + get_element_by_id, ) +# There are different sources of video in arte.tv, the extraction process +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. + class ArteTvIE(InfoExtractor): - """ - There are two sources of video in arte.tv: videos.arte.tv and - www.arte.tv/guide, the extraction process is different for each one. - The videos expire in 7 days, so we can't add tests. - """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' @@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor): @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) + return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor): # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): - mobj = re.match(self._EMISSION_URL, url) - if mobj is not None: - lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') - return self._extract_emission(url, video_id, lang) - mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') @@ -80,49 +73,6 @@ class ArteTvIE(InfoExtractor): # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id, lang): - """Extract from www.arte.tv/guide""" - webpage = self._download_webpage(url, video_id) - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') - - json_info = self._download_webpage(json_url, video_id, 'Downloading info json') - self.report_extraction(video_id) - info = json.loads(json_info) - player_info = info['videoJsonPlayer'] - - info_dict = {'id': player_info['VID'], - 'title': player_info['VTI'], - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), - 'thumbnail': player_info['programImage'], - 'ext': 'flv', - } - - formats = player_info['VSR'].values() - def _match_lang(f): - # Return true if that format is in the language of the url - if lang == 'fr': - l = 'F' - elif lang == 'de': - l = 'A' - regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] - return any(re.match(r, f['versionCode']) for r in regexes) - # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # We order the formats by quality - formats = sorted(formats, key=lambda f: int(f['height'])) - # Prefer videos without subtitles in the same language - formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) - # Pick the best quality - format_info = formats[-1] - if format_info['mediaType'] == u'rtmp': - info_dict['url'] = format_info['streamer'] - info_dict['play_path'] = 'mp4:' + format_info['url'] - else: - info_dict['url'] = format_info['url'] - - return info_dict - def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') @@ -172,3 +122,110 @@ class ArteTvIE(InfoExtractor): 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ArteTVPlus7IE(InfoExtractor): + IE_NAME = u'arte.tv:+7' + _VALID_URL = r'https?://www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' + + @classmethod + def _extract_url_info(cls, url): + mobj = re.match(cls._VALID_URL, url) + lang = mobj.group('lang') + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') + return video_id, lang + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): + json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + + json_info = self._download_webpage(json_url, video_id, 'Downloading info json') + self.report_extraction(video_id) + info = json.loads(json_info) + player_info = info['videoJsonPlayer'] + + info_dict = { + 'id': player_info['VID'], + 'title': player_info['VTI'], + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + } + + formats = player_info['VSR'].values() + def _match_lang(f): + if f.get('versionCode') is None: + return True + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, formats) + # Some formats use the m3u8 protocol + formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) + # We order the formats by quality + formats = sorted(formats, key=lambda f: int(f.get('height',-1))) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) + # Pick the best quality + def _format(format_info): + info = { + 'width': format_info.get('width'), + 'height': format_info.get('height'), + } + if format_info['mediaType'] == u'rtmp': + info['url'] = format_info['streamer'] + info['play_path'] = 'mp4:' + format_info['url'] + info['ext'] = 'flv' + else: + info['url'] = format_info['url'] + info['ext'] = determine_ext(info['url']) + return info + info_dict['formats'] = [_format(f) for f in formats] + # TODO: Remove when #980 has been merged + info_dict.update(info_dict['formats'][-1]) + + return info_dict + + +# It also uses the arte_vp_url url from the webpage to extract the information +class ArteTVCreativeIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:creative' + _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de)/magazine?/(?P.+)' + + _TEST = { + u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + u'file': u'050489-002.mp4', + u'info_dict': { + u'title': u'Agentur Amateur #2 - Corporate Design', + }, + } + + +class ArteTVFutureIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:future' + _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(thema|sujet)/.*?#article-anchor-(?P\d+)' + + _TEST = { + u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + u'file': u'050940-003.mp4', + u'info_dict': { + u'title': u'Les champignons au secours de la planète', + }, + } + + def _real_extract(self, url): + anchor_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, anchor_id) + row = get_element_by_id(anchor_id, webpage) + return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 08b28c994..493504f75 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -115,7 +115,7 @@ class BlipTVIE(InfoExtractor): ext = umobj.group(1) info = { - 'id': data['item_id'], + 'id': compat_str(data['item_id']), 'url': video_url, 'uploader': data['display_name'], 'upload_date': upload_date, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 71e3c7883..745212f2f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import re import json import xml.etree.ElementTree @@ -7,15 +9,39 @@ from ..utils import ( compat_urllib_parse, find_xpath_attr, compat_urlparse, + + ExtractorError, ) class BrightcoveIE(InfoExtractor): _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' - - # There is a test for Brigtcove in GenericIE, that way we test both the download - # and the detection of videos, and we don't have to find an URL that is always valid + + _TESTS = [ + { + # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', + u'file': u'2371591881001.mp4', + u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'note': u'Test Brightcove downloads and detection in GenericIE', + u'info_dict': { + u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + u'uploader': u'8TV', + u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', + } + }, + { + # From http://medianetwork.oracle.com/video/player/1785452137001 + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', + u'file': u'1785452137001.flv', + u'info_dict': { + u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', + u'uploader': u'Oracle', + }, + }, + ] @classmethod def _build_brighcove_url(cls, object_str): @@ -23,6 +49,11 @@ class BrightcoveIE(InfoExtractor): Build a Brightcove url from a xml string containing {params} """ + + # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 + object_str = re.sub(r'(', + lambda m: m.group(1) + '/>', object_str) + object_doc = xml.etree.ElementTree.fromstring(object_str) assert u'BrightcoveExperience' in object_doc.attrib['class'] params = {'flashID': object_doc.attrib['id'], @@ -72,15 +103,27 @@ class BrightcoveIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): - renditions = video_info['renditions'] - renditions = sorted(renditions, key=lambda r: r['size']) - best_format = renditions[-1] + info = { + 'id': video_info['id'], + 'title': video_info['displayName'], + 'description': video_info.get('shortDescription'), + 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), + 'uploader': video_info.get('publisherName'), + } - return {'id': video_info['id'], - 'title': video_info['displayName'], - 'url': best_format['defaultURL'], + renditions = video_info.get('renditions') + if renditions: + renditions = sorted(renditions, key=lambda r: r['size']) + best_format = renditions[-1] + info.update({ + 'url': best_format['defaultURL'], 'ext': 'mp4', - 'description': video_info.get('shortDescription'), - 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), - 'uploader': video_info.get('publisherName'), - } + }) + elif video_info.get('FLVFullLengthURL') is not None: + info.update({ + 'url': video_info['FLVFullLengthURL'], + 'ext': 'flv', + }) + else: + raise ExtractorError(u'Unable to extract video url for %s' % info['id']) + return info diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..6925b96c2 --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,91 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class CinemassacreIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?(?Pcinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/.+?)(?:[/?].*)?' + _TESTS = [{ + u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + u'file': u'19911.flv', + u'info_dict': { + u'upload_date': u'20121110', + u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', + u'description': u'md5:fb87405fcb42a331742a0dce2708560b', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, + { + u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + u'file': u'521be8ef82b16.flv', + u'info_dict': { + u'upload_date': u'20131002', + u'title': u'The Mummy’s Hand (1940)', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') + mobj = re.search(r'src="(?Phttp://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P.+?))"', webpage) + if not mobj: + raise ExtractorError(u'Can\'t extract embed url and video id') + playerdata_url = mobj.group(u'embed_url') + video_id = mobj.group(u'video_id') + + video_title = self._html_search_regex(r'(?P<title>.+?)\|', + webpage, u'title') + video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, u'description', flags=re.DOTALL, fatal=False) + if len(video_description) == 0: + video_description = None + + playerdata = self._download_webpage(playerdata_url, video_id) + base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'', + playerdata, u'base_url') + base_url += '/Cinemassacre/' + # Important: The file names in playerdata are not used by the player and even wrong for some videos + sd_file = 'Cinemassacre-%s_high.mp4' % video_id + hd_file = 'Cinemassacre-%s.mp4' % video_id + video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + + formats = [ + { + 'url': base_url + sd_file, + 'ext': 'flv', + 'format': 'sd', + 'format_id': 'sd', + }, + { + 'url': base_url + hd_file, + 'ext': 'flv', + 'format': 'hd', + 'format_id': 'hd', + }, + ] + + info = { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index bf8d711ee..69b2beece 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -51,12 +51,12 @@ class ComedyCentralIE(InfoExtractor): '400': 'mp4', } _video_dimensions = { - '3500': '1280x720', - '2200': '960x540', - '1700': '768x432', - '1200': '640x360', - '750': '512x288', - '400': '384x216', + '3500': (1280, 720), + '2200': (960, 540), + '1700': (768, 432), + '1200': (640, 360), + '750': (512, 288), + '400': (384, 216), } @classmethod @@ -64,11 +64,13 @@ class ComedyCentralIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???'))) - + @staticmethod + def _transform_rtmp_url(rtmp_video_url): + m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) + if not m: + raise ExtractorError(u'Cannot transform RTMP url') + base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' + return base + m.group('finalid') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -155,40 +157,31 @@ class ComedyCentralIE(InfoExtractor): self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') continue - if self._downloader.params.get('listformats', None): - self._print_formats([i[0] for i in turls]) - return - - # For now, just pick the highest bitrate - format,rtmp_video_url = turls[-1] - - # Get the format arg from the arg stream - req_format = self._downloader.params.get('format', None) - - # Select format if we can find one - for f,v in turls: - if f == req_format: - format, rtmp_video_url = f, v - break - - m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) - if not m: - raise ExtractorError(u'Cannot transform RTMP url') - base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' - video_url = base + m.group('finalid') + formats = [] + for format, rtmp_video_url in turls: + w, h = self._video_dimensions.get(format, (None, None)) + formats.append({ + 'url': self._transform_rtmp_url(rtmp_video_url), + 'ext': self._video_extensions.get(format, 'mp4'), + 'format_id': format, + 'height': h, + 'width': w, + }) effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) info = { 'id': shortMediaId, - 'url': video_url, + 'formats': formats, 'uploader': showId, 'upload_date': officialDate, 'title': effTitle, - 'ext': 'mp4', - 'format': format, 'thumbnail': None, 'description': compat_str(officialTitle), } + + # TODO: Remove when #980 has been merged + info.update(info['formats'][-1]) + results.append(info) return results diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77726ee24..2a5a85dc6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -35,6 +35,8 @@ class InfoExtractor(object): title: Video title, unescaped. ext: Video filename extension. + Instead of url and ext, formats can also specified. + The following fields are optional: format: The video format, defaults to ext (used for --get-format) @@ -52,8 +54,20 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen - - The fields should all be Unicode strings. + age_limit: Age restriction for the video, as an integer (years) + formats: A list of dictionaries for each format available, it must + be ordered from worst to best quality. Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from width and height if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19") + * width Width of the video, if known + * height Height of the video, if known + + Unless mentioned otherwise, the fields should be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. @@ -305,6 +319,15 @@ class InfoExtractor(object): self._og_regex('video')], html, name, **kargs) + def _rta_search(self, html): + # See http://www.rtalabel.org/index.php?content=howtofaq#single + if re.search(r'(?ix)<meta\s+name="rating"\s+' + r' content="RTA-5042-1996-1400-1577-RTA"', + html): + return 18 + return 0 + + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 360113f9c..7d8353946 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -10,25 +10,49 @@ from ..utils import ( compat_str, get_element_by_attribute, get_element_by_id, + orderedSet, ExtractorError, ) +class DailymotionBaseInfoExtractor(InfoExtractor): + @staticmethod + def _build_request(url): + """Build a request with the family filter disabled""" + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'family_filter=off') + return request -class DailymotionIE(SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' - _TEST = { - u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', - u'file': u'x33vw9.mp4', - u'md5': u'392c4b85a60a90dc4792da41ce3144eb', - u'info_dict': { - u"uploader": u"Amphora Alex and Van .", - u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" - } - } + _TESTS = [ + { + u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', + u'file': u'x33vw9.mp4', + u'md5': u'392c4b85a60a90dc4792da41ce3144eb', + u'info_dict': { + u"uploader": u"Amphora Alex and Van .", + u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } + }, + # Vevo video + { + u'url': u'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + u'file': u'USUV71301934.mp4', + u'info_dict': { + u'title': u'Roar (Official)', + u'uploader': u'Katy Perry', + u'upload_date': u'20130905', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'VEVO is only available in some countries', + }, + ] def _real_extract(self, url): # Extract id and simplified title from URL @@ -40,13 +64,21 @@ class DailymotionIE(SubtitlesInfoExtractor): url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') + request = self._build_request(url) webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) + # It may just embed a vevo video: + m_vevo = re.search( + r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?videoId=(?P<id>[\w]*)', + webpage) + if m_vevo is not None: + vevo_id = m_vevo.group('id') + self.to_screen(u'Vevo video detected: %s' % vevo_id) + return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') + video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], @@ -63,6 +95,9 @@ class DailymotionIE(SubtitlesInfoExtractor): info = self._search_regex(r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE) info = json.loads(info) + if info.get('error') is not None: + msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] + raise ExtractorError(msg, expected=True) # TODO: support choosing qualities @@ -110,29 +145,56 @@ class DailymotionIE(SubtitlesInfoExtractor): return {} -class DailymotionPlaylistIE(InfoExtractor): +class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): + IE_NAME = u'dailymotion:playlist' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>' + _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + def _extract_entries(self, id): video_ids = [] - for pagenum in itertools.count(1): - webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), - playlist_id, u'Downloading page %s' % pagenum) + request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) + webpage = self._download_webpage(request, + id, u'Downloading page %s' % pagenum) playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) - video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break + return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in orderedSet(video_ids)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) - entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') - for video_id in video_ids] return {'_type': 'playlist', 'id': playlist_id, 'title': get_element_by_id(u'playlist_name', webpage), - 'entries': entries, + 'entries': self._extract_entries(playlist_id), } + + +class DailymotionUserIE(DailymotionPlaylistIE): + IE_NAME = u'dailymotion:user' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)' + _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + webpage = self._download_webpage(url, user) + full_user = self._html_search_regex( + r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user), + webpage, u'user', flags=re.DOTALL) + + return { + '_type': 'playlist', + 'id': user, + 'title': full_user, + 'entries': self._extract_entries(user), + } diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py new file mode 100644 index 000000000..f02c6998b --- /dev/null +++ b/youtube_dl/extractor/ebaumsworld.py @@ -0,0 +1,37 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class EbaumsWorldIE(InfoExtractor): + _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', + u'file': u'83367677.mp4', + u'info_dict': { + u'title': u'A Giant Python Opens The Door', + u'description': u'This is how nightmares start...', + u'uploader': u'jihadpizza', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + config_xml = self._download_webpage( + 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + video_url = config.find('file').text + + return { + 'id': video_id, + 'title': config.find('title').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'description': config.find('description').text, + 'thumbnail': config.find('image').text, + 'uploader': config.find('username').text, + } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index beaa5b4bd..9d1bc0751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -106,8 +106,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>', - webpage, u'title') + video_title = self._html_search_regex( + r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title') info = { 'id': video_id, diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py new file mode 100644 index 000000000..deaa4ed2d --- /dev/null +++ b/youtube_dl/extractor/faz.py @@ -0,0 +1,60 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + clean_html, + get_element_by_attribute, +) + + +class FazIE(InfoExtractor): + IE_NAME = u'faz.net' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html' + + _TEST = { + u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', + u'file': u'12610585.mp4', + u'info_dict': { + u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', + u'description': u'md5:1453fbf9a0d041d985a47306192ea253', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + self.to_screen(video_id) + webpage = self._download_webpage(url, video_id) + config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, + u'config xml url') + config_xml = self._download_webpage(config_xml_url, video_id, + u'Downloading config xml') + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + + encodings = config.find('ENCODINGS') + formats = [] + for code in ['LOW', 'HIGH', 'HQ']: + encoding = encodings.find(code) + if encoding is None: + continue + encoding_url = encoding.find('FILENAME').text + formats.append({ + 'url': encoding_url, + 'ext': determine_ext(encoding_url), + 'format_id': code.lower(), + }) + + descr_html = get_element_by_attribute('class', 'Content Copy', webpage) + info = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': clean_html(descr_html), + 'thumbnail': config.find('STILL/STILL_BIG').text, + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py new file mode 100644 index 000000000..9c89362ef --- /dev/null +++ b/youtube_dl/extractor/fktv.py @@ -0,0 +1,79 @@ +import re +import random +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + get_element_by_id, + clean_html, +) + + +class FKTVIE(InfoExtractor): + IE_NAME = u'fernsehkritik.tv' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + + _TEST = { + u'url': u'http://fernsehkritik.tv/folge-1', + u'file': u'00011.flv', + u'info_dict': { + u'title': u'Folge 1 vom 10. April 2007', + u'description': u'md5:fb4818139c7cfe6907d4b83412a6864f', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2, 4) + video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode + start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, + episode) + playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, + u'playlist', flags=re.DOTALL) + files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) + # TODO: return a single multipart video + videos = [] + for i, _ in enumerate(files, 1): + video_id = '%04d%d' % (episode, i) + video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) + video_title = 'Fernsehkritik %d.%d' % (episode, i) + videos.append({ + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': clean_html(get_element_by_id('eptitle', start_webpage)), + 'description': clean_html(get_element_by_id('contentlist', start_webpage)), + 'thumbnail': video_thumbnail + }) + return videos + + +class FKTVPosteckeIE(InfoExtractor): + IE_NAME = u'fernsehkritik.tv:postecke' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' + _TEST = { + u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', + u'file': u'0120.flv', + u'md5': u'262f0adbac80317412f7e57b4808e5c4', + u'info_dict': { + u"title": u"Postecke 120" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2, 4) + video_id = '%04d' % episode + video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode) + video_title = 'Postecke %d' % episode + return { + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': video_title, + } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 80d96baf7..e1d2f0526 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -9,7 +9,7 @@ from ..utils import ( class FlickrIE(InfoExtractor): """Information Extractor for Flickr videos""" - _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'(?:https?://)?(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' _TEST = { u'url': u'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', u'file': u'5645318632.mp4', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f2b12c884..086cafca0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,7 @@ # encoding: utf-8 import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( @@ -34,17 +35,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): IE_NAME = u'pluzz.francetv.fr' _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' - _TEST = { - u'url': u'http://pluzz.francetv.fr/videos/allo_rufo_saison5_,88439064.html', - u'file': u'88439064.mp4', - u'info_dict': { - u'title': u'Allô Rufo', - u'description': u'md5:d909f1ebdf963814b65772aea250400e', - }, - u'params': { - u'skip_download': True, - }, - } + # Can't use tests, videos expire in 7 days def _real_extract(self, url): title = re.match(self._VALID_URL, url).group(1) @@ -75,3 +66,64 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, page_title) video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id') return self._extract_video(video_id) + + +class France2IE(FranceTVBaseInfoExtractor): + IE_NAME = u'france2.fr' + _VALID_URL = r'''(?x)https?://www\.france2\.fr/ + (?: + emissions/.*?/videos/(?P<id>\d+) + | emission/(?P<key>[^/?]+) + )''' + + _TEST = { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj.group('key'): + webpage = self._download_webpage(url, mobj.group('key')) + video_id = self._html_search_regex( + r'''(?x)<div\s+class="video-player">\s* + <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+ + class="francetv-video-player">''', + webpage, u'video ID') + else: + video_id = mobj.group('id') + return self._extract_video(video_id) + + +class GenerationQuoiIE(InfoExtractor): + IE_NAME = u'france2.fr:generation-quoi' + _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)' + + _TEST = { + u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', + u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', + u'info_dict': { + u'title': u'Génération Quoi - Garde à Vous', + u'uploader': u'Génération Quoi', + }, + u'params': { + # It uses Dailymotion + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name) + info_json = self._download_webpage(info_url, name) + info = json.loads(info_json) + return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], + ie='Dailymotion') diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index f3d86a711..2ccdb7073 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,8 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'type="video/mp4" src="(.*?)"', + video_url = self._search_regex( + [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], webpage, u'video URL', flags=re.DOTALL) info = { diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index cd3bbe65f..098768361 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,55 +1,59 @@ import re -import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, compat_urllib_parse, + compat_urlparse, + unescapeHTML, + get_meta_content, ) + class GameSpotIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' _TEST = { u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", - u"file": u"6410818.mp4", + u"file": u"gs-2300-6410818.mp4", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { u"title": u"Arma 3 - Community Guide: SITREP I", - u"upload_date": u"20130627", + u'description': u'Check out this video where some of the basics of Arma 3 is explained.', } } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('page_id') + page_id = video_id = mobj.group('page_id') webpage = self._download_webpage(url, page_id) - video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"', - r'http://www\.gamespot\.com/videoembed/(\d+)'], - webpage, 'video id') - data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'}) - info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data - info_xml = self._download_webpage(info_url, video_id) - doc = xml.etree.ElementTree.fromstring(info_xml) - clip_el = doc.find('./playList/clip') + data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') + data_video = json.loads(unescapeHTML(data_video_json)) - http_urls = [{'url': node.find('filePath').text, - 'rate': int(node.find('rate').text)} - for node in clip_el.find('./httpURI')] - best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1] - video_url = best_quality['url'] - title = clip_el.find('./title').text - ext = video_url.rpartition('.')[2] - thumbnail_url = clip_el.find('./screenGrabURI').text - view_count = int(clip_el.find('./views').text) - upload_date = unified_strdate(clip_el.find('./postDate').text) + # Transform the manifest url to a link to the mp4 files + # they are used in mobile devices. + f4m_url = data_video['videoStreams']['f4m_stream'] + f4m_path = compat_urlparse.urlparse(f4m_url).path + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',') + http_path = f4m_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/manifest.f4m', '') + http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) + formats = [] + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': q, + }) - return [{ - 'id' : video_id, - 'url' : video_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'upload_date' : upload_date, - 'view_count' : view_count, - }] + info = { + 'id': data_video['guid'], + 'title': compat_urllib_parse.unquote(data_video['title']), + 'formats': formats, + 'description': get_meta_content('description', webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f92e61fea..d48c84f8d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,17 +29,6 @@ class GenericIE(InfoExtractor): u"title": u"R\u00e9gis plante sa Jeep" } }, - { - u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', - u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', - u'note': u'Test Brightcove downloads and detection in GenericIE', - u'info_dict': { - u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', - u'uploader': u'8TV', - u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', - } - }, ] def report_download_webpage(self, video_id): @@ -128,11 +117,11 @@ class GenericIE(InfoExtractor): except ValueError: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError(u'Failed to download URL: %s' % url) self.report_extraction(video_id) # Look for BrightCove: - m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) + m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) @@ -160,12 +149,12 @@ class GenericIE(InfoExtractor): # HTML5 video mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError(u'Unsupported URL: %s' % url) # It's possible that one of the regexes # matched, but returned an empty group: if mobj.group(1) is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError(u'Did not find a valid video URL at %s' % url) video_url = mobj.group(1) video_url = compat_urlparse.urljoin(url, video_url) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 8895ad289..ab12d7e93 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -41,7 +41,8 @@ class GooglePlusIE(InfoExtractor): # Extract update date upload_date = self._html_search_regex( - ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'], + r'''(?x)<a.+?class="o-T-s\s[^"]+"\s+style="display:\s*none"\s*> + ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index b1c84278a..c52146f7d 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -13,7 +13,7 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles)(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' IE_NAME = u'ign.com' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' @@ -21,15 +21,39 @@ class IGNIE(InfoExtractor): r'id="my_show_video">.*?<p>(.*?)</p>', ] - _TEST = { - u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - u'file': u'8f862beef863986b2785559b9e1aa599.mp4', - u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', - u'info_dict': { - u'title': u'The Last of Us Review', - u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', - } - } + _TESTS = [ + { + u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + u'file': u'8f862beef863986b2785559b9e1aa599.mp4', + u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', + u'info_dict': { + u'title': u'The Last of Us Review', + u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', + } + }, + { + u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', + u'playlist': [ + { + u'file': u'5ebbd138523268b93c9141af17bec937.mp4', + u'info_dict': { + u'title': u'GTA 5 Video Review', + u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + }, + }, + { + u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', + u'info_dict': { + u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', + }, + }, + ], + u'params': { + u'skip_download': True, + }, + }, + ] def _find_video_id(self, webpage): res_id = [r'data-video-id="(.+?)"', @@ -46,6 +70,13 @@ class IGNIE(InfoExtractor): if page_type == 'articles': video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') return self.url_result(video_url, ie='IGN') + elif page_type != 'video': + multiple_urls = re.findall( + '<param name="flashvars" value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', + webpage) + if multiple_urls: + return [self.url_result(u, ie='IGN') for u in multiple_urls] + video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) description = self._html_search_regex(self._DESCRIPTION_RE, @@ -87,6 +118,9 @@ class OneUPIE(IGNIE): } } + # Override IGN tests + _TESTS = [] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) id = mobj.group('name_or_id') diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py new file mode 100644 index 000000000..5986459d6 --- /dev/null +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -0,0 +1,87 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_urllib_parse, + xpath_with_ns, + determine_ext, +) + + +class InternetVideoArchiveIE(InfoExtractor): + _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + + _TEST = { + u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + u'file': u'452693.mp4', + u'info_dict': { + u'title': u'SKYFALL', + u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + u'duration': 156, + }, + } + + @staticmethod + def _build_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + + @staticmethod + def _clean_query(query): + NEEDED_ARGS = ['publishedid', 'customerid'] + query_dic = compat_urlparse.parse_qs(query) + cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS) + # Other player ids return m3u8 urls + cleaned_dic['playerid'] = '247' + cleaned_dic['videokbrate'] = '100000' + return compat_urllib_parse.urlencode(cleaned_dic) + + def _real_extract(self, url): + query = compat_urlparse.urlparse(url).query + query_dic = compat_urlparse.parse_qs(query) + video_id = query_dic['publishedid'][0] + url = self._build_url(query) + + flashconfiguration_xml = self._download_webpage(url, video_id, + u'Downloading flash configuration') + flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) + file_url = flashconfiguration.find('file').text + file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') + # Replace some of the parameters in the query to get the best quality + # and http links (no m3u8 manifests) + file_url = re.sub(r'(?<=\?)(.+)$', + lambda m: self._clean_query(m.group()), + file_url) + info_xml = self._download_webpage(file_url, video_id, + u'Downloading video info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + item = info.find('channel/item') + + def _bp(p): + return xpath_with_ns(p, + {'media': 'http://search.yahoo.com/mrss/', + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'}) + formats = [] + for content in item.findall(_bp('media:group/media:content')): + attr = content.attrib + f_url = attr['url'] + formats.append({ + 'url': f_url, + 'ext': determine_ext(f_url), + 'width': int(attr['width']), + 'bitrate': int(attr['bitrate']), + }) + formats = sorted(formats, key=lambda f: f['bitrate']) + + info = { + 'id': video_id, + 'title': item.find('title').text, + 'formats': formats, + 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], + 'description': item.find('description').text, + 'duration': int(attr['duration']), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 4327bc13d..6bb54b932 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -6,13 +6,14 @@ import xml.etree.ElementTree from .common import InfoExtractor + class JeuxVideoIE(InfoExtractor): _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' _TEST = { u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', u'file': u'5182.mp4', - u'md5': u'e0fdb0cd3ce98713ef9c1e1e025779d0', + u'md5': u'046e491afb32a8aaac1f44dd4ddd54ee', u'info_dict': { u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité', u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', @@ -23,25 +24,29 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = re.match(self._VALID_URL, url).group(1) webpage = self._download_webpage(url, title) - m_download = re.search(r'<param name="flashvars" value="config=(.*?)" />', webpage) - - xml_link = m_download.group(1) + xml_link = self._html_search_regex( + r'<param name="flashvars" value="config=(.*?)" />', + webpage, u'config URL') - id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + video_id = self._search_regex( + r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', + xml_link, u'video ID') - xml_config = self._download_webpage(xml_link, title, - 'Downloading XML config') + xml_config = self._download_webpage( + xml_link, title, u'Downloading XML config') config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) - info = re.search(r'<format\.json>(.*?)</format\.json>', - xml_config, re.MULTILINE|re.DOTALL).group(1) - info = json.loads(info)['versions'][0] + info_json = self._search_regex( + r'(?sm)<format\.json>(.*?)</format\.json>', + xml_config, u'JSON information') + info = json.loads(info_json)['versions'][0] video_url = 'http://video720.jeuxvideo.com/' + info['file'] - return {'id': id, - 'title' : config.find('titre_video').text, - 'ext' : 'mp4', - 'url' : video_url, - 'description': self._og_search_description(webpage), - 'thumbnail': config.find('image').text, - } + return { + 'id': video_id, + 'title': config.find('titre_video').text, + 'ext': 'mp4', + 'url': video_url, + 'description': self._og_search_description(webpage), + 'thumbnail': config.find('image').text, + } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 309921078..d04da98c8 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,7 +2,12 @@ import re import json from .common import InfoExtractor -from ..utils import compat_urllib_parse_urlparse, compat_urlparse +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urlparse, + get_meta_content, + ExtractorError, +) class LivestreamIE(InfoExtractor): @@ -35,8 +40,11 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', - webpage, 'api url') + player = get_meta_content('twitter:player', webpage) + if player is None: + raise ExtractorError('Couldn\'t extract event api url') + api_url = player.replace('/player', '') + api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) info = json.loads(self._download_webpage(api_url, event_name, u'Downloading event info')) videos = [self._extract_video_info(video_data['data']) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8f956571d..e520e2bb4 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -54,23 +54,26 @@ class MTVIE(InfoExtractor): def _get_thumbnail_url(self, uri, itemdoc): return 'http://mtv.mtvnimages.com/uri/' + uri - def _extract_video_url(self, metadataXml): + def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: raise ExtractorError(u'This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) renditions = mdoc.findall('.//rendition') - # For now, always pick the highest quality. - rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - rtmp_video_url = rendition.find('./src').text - except KeyError: - raise ExtractorError('Invalid rendition field.') - video_url = self._transform_rtmp_url(rtmp_video_url) - return {'ext': ext, 'url': video_url, 'format': format} + formats = [] + for rendition in mdoc.findall('.//rendition'): + try: + _, _, ext = rendition.attrib['type'].partition('/') + rtmp_video_url = rendition.find('./src').text + formats.append({'ext': ext, + 'url': self._transform_rtmp_url(rtmp_video_url), + 'format_id': rendition.get('bitrate'), + 'width': int(rendition.get('width')), + 'height': int(rendition.get('height')), + }) + except (KeyError, TypeError): + raise ExtractorError('Invalid rendition field.') + return formats def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text @@ -81,19 +84,25 @@ class MTVIE(InfoExtractor): mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, u'Downloading video urls') - video_info = self._extract_video_url(mediagen_page) description_node = itemdoc.find('description') if description_node is not None: - description = description_node.text + description = description_node.text.strip() else: description = None - video_info.update({'title': itemdoc.find('title').text, - 'id': video_id, - 'thumbnail': self._get_thumbnail_url(uri, itemdoc), - 'description': description, - }) - return video_info + + info = { + 'title': itemdoc.find('title').text, + 'formats': self._extract_video_formats(mediagen_page), + 'id': video_id, + 'thumbnail': self._get_thumbnail_url(uri, itemdoc), + 'description': description, + } + + # TODO: Remove when #980 has been merged + info.update(info['formats'][-1]) + + return info def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py new file mode 100644 index 000000000..2ef80bce0 --- /dev/null +++ b/youtube_dl/extractor/newgrounds.py @@ -0,0 +1,38 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class NewgroundsIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P<id>\d+)' + _TEST = { + u'url': u'http://www.newgrounds.com/audio/listen/549479', + u'file': u'549479.mp3', + u'md5': u'fe6033d297591288fa1c1f780386f07a', + u'info_dict': { + u"title": u"B7 - BusMode", + u"uploader": u"Burn7", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + music_id = mobj.group('id') + webpage = self._download_webpage(url, music_id) + + title = self._html_search_regex(r',"name":"([^"]+)",', webpage, u'music title') + uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, u'music uploader') + + music_url_json_string = self._html_search_regex(r'({"url":"[^"]+"),', webpage, u'music url') + '}' + music_url_json = json.loads(music_url_json_string) + music_url = music_url_json['url'] + + return { + 'id': music_id, + 'title': title, + 'url': music_url, + 'uploader': uploader, + 'ext': determine_ext(music_url), + } diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py new file mode 100644 index 000000000..e8d43dd13 --- /dev/null +++ b/youtube_dl/extractor/nhl.py @@ -0,0 +1,120 @@ +import re +import json +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_urllib_parse, + determine_ext, + unified_strdate, +) + + +class NHLBaseInfoExtractor(InfoExtractor): + @staticmethod + def _fix_json(json_string): + return json_string.replace('\\\'', '\'') + + def _extract_video(self, info): + video_id = info['id'] + self.report_extraction(video_id) + + initial_video_url = info['publishPoint'] + data = compat_urllib_parse.urlencode({ + 'type': 'fvod', + 'path': initial_video_url.replace('.mp4', '_sd.mp4'), + }) + path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data + path_response = self._download_webpage(path_url, video_id, + u'Downloading final video url') + path_doc = xml.etree.ElementTree.fromstring(path_response) + video_url = path_doc.find('path').text + + join = compat_urlparse.urljoin + return { + 'id': video_id, + 'title': info['name'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'description': info['description'], + 'duration': int(info['duration']), + 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), + 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), + } + + +class NHLIE(NHLBaseInfoExtractor): + IE_NAME = u'nhl.com' + _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)' + + _TEST = { + u'url': u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', + u'file': u'453614.mp4', + u'info_dict': { + u'title': u'Quick clip: Weise 4-3 goal vs Flames', + u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.', + u'duration': 18, + u'upload_date': u'20131006', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id + info_json = self._download_webpage(json_url, video_id, + u'Downloading info json') + info_json = self._fix_json(info_json) + info = json.loads(info_json)[0] + return self._extract_video(info) + + +class NHLVideocenterIE(NHLBaseInfoExtractor): + IE_NAME = u'nhl.com:videocenter' + IE_DESC = u'Download the first 12 videos from a videocenter category' + _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?' + + @classmethod + def suitable(cls, url): + if NHLIE.suitable(url): + return False + return super(NHLVideocenterIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + team = mobj.group('team') + webpage = self._download_webpage(url, team) + cat_id = self._search_regex( + [r'var defaultCatId = "(.+?)";', + r'{statusIndex:0,index:0,.*?id:(.*?),'], + webpage, u'category id') + playlist_title = self._html_search_regex( + r'\?catid=%s">(.*?)</a>' % cat_id, + webpage, u'playlist title', flags=re.DOTALL) + + data = compat_urllib_parse.urlencode({ + 'cid': cat_id, + # This is the default value + 'count': 12, + 'ptrs': 3, + 'format': 'json', + }) + path = '/videocenter/servlets/browse?' + data + request_url = compat_urlparse.urljoin(url, path) + response = self._download_webpage(request_url, playlist_title) + response = self._fix_json(response) + if not response.strip(): + self._downloader.report_warning(u'Got an empty reponse, trying ' + u'adding the "newvideos" parameter') + response = self._download_webpage(request_url + '&newvideos=true', + playlist_title) + response = self._fix_json(response) + videos = json.loads(response) + + return { + '_type': 'playlist', + 'title': playlist_title, + 'id': cat_id, + 'entries': [self._extract_video(i) for i in videos], + } diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py new file mode 100644 index 000000000..ab52ad401 --- /dev/null +++ b/youtube_dl/extractor/nowvideo.py @@ -0,0 +1,43 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urlparse + + +class NowVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)' + _TEST = { + u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa', + u'file': u'0mw0yow7b6dxa.flv', + u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817', + u'info_dict': { + u"title": u"youtubedl test video _BaW_jenozKc.mp4" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.nowvideo.ch/video/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'<h4>(.*)</h4>', + webpage, u'video title') + + video_key = self._search_regex(r'var fkzd="(.*)";', + webpage, u'video key') + + api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) + api_response = self._download_webpage(api_call, video_id, + u'Downloading API page') + video_url = compat_urlparse.parse_qs(api_response)[u'url'][0] + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + }] diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index add76a11e..5d770ec28 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -38,6 +38,7 @@ class PornotubeIE(InfoExtractor): VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) + age_limit = self._rta_search(webpage) info = {'id': video_id, 'url': video_url, @@ -45,6 +46,7 @@ class PornotubeIE(InfoExtractor): 'upload_date': upload_date, 'title': video_title, 'ext': 'flv', - 'format': 'flv'} + 'format': 'flv', + 'age_limit': age_limit} return [info] diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 1d2cf1f56..365aade56 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -14,24 +14,30 @@ class RedTubeIE(InfoExtractor): } } - def _real_extract(self,url): + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - video_extension = 'mp4' + video_extension = 'mp4' webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) - video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">', - webpage, u'video URL') + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL') - video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', + video_title = self._html_search_regex( + r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage, u'title') - return [{ - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, - }] + # No self-labeling, but they describe themselves as + # "Home of Videos Porno" + age_limit = 18 + + return { + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py new file mode 100644 index 000000000..c79c39413 --- /dev/null +++ b/youtube_dl/extractor/rottentomatoes.py @@ -0,0 +1,16 @@ +from .videodetective import VideoDetectiveIE + + +# It just uses the same method as videodetective.com, +# the internetvideoarchive.com is extracted from the og:video property +class RottenTomatoesIE(VideoDetectiveIE): + _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + u'file': '613340.mp4', + u'info_dict': { + u'title': u'TOY STORY 3', + u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + }, + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 7bb236c2b..d1b08c9bc 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import ( ) class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" - _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" + _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/|(?:www\.)?n-tvnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -61,8 +61,35 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', + u'file': u'127367.flv', + u'info_dict': { + u'upload_date': u'20130926', + u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', + u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', + u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', + }, + u'params': { + u'skip_download': True, + }, + }, + { + u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', + u'file': u'124903.flv', + u'info_dict': { + u'upload_date': u'20130101', + u'title': u'Top Gear vom 01.01.2013', + u'description': u'Episode 1', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', }] + def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) @@ -79,20 +106,23 @@ class RTLnowIE(InfoExtractor): msg = clean_html(note_m.group(1)) raise ExtractorError(msg) - video_title = self._html_search_regex(r'<title>(?P<title>[^<]+)', + video_title = self._html_search_regex(r'(?P<title>[^<]+?)( \| [^<]*)?', webpage, u'title') playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', webpage, u'playerdata_url') playerdata = self._download_webpage(playerdata_url, video_id) - mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]>', playerdata) + mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]>', playerdata) if mobj: video_description = mobj.group(u'description') if mobj.group('upload_date_Y'): video_upload_date = mobj.group('upload_date_Y') - else: + elif mobj.group('upload_date_y'): video_upload_date = u'20' + mobj.group('upload_date_y') - video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_upload_date = None + if video_upload_date: + video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') else: video_description = None video_upload_date = None diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index a5dc754dd..b1e96b679 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -5,7 +5,7 @@ from .mtv import MTVIE, _media_xml_tag class SouthParkStudiosIE(MTVIE): IE_NAME = u'southparkstudios.com' - _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P\d+)' + _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$)' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -23,7 +23,11 @@ class SouthParkStudiosIE(MTVIE): def _get_thumbnail_url(self, uri, itemdoc): search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + else: + return thumb_node.attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py new file mode 100644 index 000000000..cd3e203e6 --- /dev/null +++ b/youtube_dl/extractor/sztvhu.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class SztvHuIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P[0-9]+)' + _TEST = { + u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', + u'file': u'20130909.mp4', + u'md5': u'a6df607b11fb07d0e9f2ad94613375cb', + u'info_dict': { + u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", + u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + video_file = self._search_regex( + r'file: "...:(.*?)",', webpage, 'video file') + title = self._html_search_regex( + r'', + webpage, 'video description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + + video_url = 'http://media.sztv.hu/vod/' + video_file + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url), + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 4c11f7a03..dfa1176a3 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -77,12 +77,20 @@ class TEDIE(InfoExtractor): thumbnail = self._search_regex(r'[\s.]*[\s.]* last_version['version']: + last_version = version + if last_version['version'] == -1: + raise ExtractorError(u'Unable to extract last version of the video') + + renditions = xml.etree.ElementTree.fromstring(last_version['data']) + formats = [] + # Already sorted from worst to best quality + for rend in renditions.findall('rendition'): + attr = rend.attrib + f_url = attr['url'] + formats.append({ + 'url': f_url, + 'ext': determine_ext(f_url), + 'height': int(attr['frameheight']), + 'width': int(attr['frameWidth']), + }) + + date_epoch = int(self._search_regex( + r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000 + upload_date = datetime.datetime.fromtimestamp(date_epoch) + info = { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'thumbnail': video_info['imageUrl'], + 'upload_date': upload_date.strftime('%Y%m%d'), + 'uploader': video_info['mainArtists'][0]['artistName'], + 'duration': video_info['duration'], + } + + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + + return info diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py new file mode 100644 index 000000000..12c84a985 --- /dev/null +++ b/youtube_dl/extractor/viddler.py @@ -0,0 +1,64 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class ViddlerIE(InfoExtractor): + _VALID_URL = r'(?Phttps?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P[0-9]+)' + _TEST = { + u"url": u"http://www.viddler.com/v/43903784", + u'file': u'43903784.mp4', + u'md5': u'fbbaedf7813e514eb7ca30410f439ac9', + u'info_dict': { + u"title": u"Video Made Easy", + u"uploader": u"viddler", + u"duration": 100.89, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + embed_url = mobj.group('domain') + u'/embed/' + video_id + webpage = self._download_webpage(embed_url, video_id) + + video_sources_code = self._search_regex( + r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs') + video_sources = json.loads(video_sources_code.replace("'", '"')) + + formats = [{ + 'url': video_url, + 'format': format_id, + } for video_url, format_id in video_sources.items()] + + title = self._html_search_regex( + r"title\s*:\s*'([^']*)'", webpage, u'title') + uploader = self._html_search_regex( + r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False) + duration_s = self._html_search_regex( + r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False) + duration = float(duration_s) if duration_s else None + thumbnail = self._html_search_regex( + r"thumbnail\s*:\s*'([^']*)'", + webpage, u'thumbnail', fatal=False) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, + 'formats': formats, + } + + # TODO: Remove when #980 has been merged + info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url']) + info.update(info['formats'][-1]) + + return info diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py new file mode 100644 index 000000000..d89f84094 --- /dev/null +++ b/youtube_dl/extractor/videodetective.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + compat_urlparse, +) + + +class VideoDetectiveIE(InfoExtractor): + _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P\d+)' + + _TEST = { + u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487', + u'file': u'194487.mp4', + u'info_dict': { + u'title': u'KICK-ASS 2', + u'description': u'md5:65ba37ad619165afac7d432eaded6013', + u'duration': 138, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + return self.url_result(InternetVideoArchiveIE._build_url(query), + ie=InternetVideoArchiveIE.ie_key()) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py new file mode 100644 index 000000000..65f39b982 --- /dev/null +++ b/youtube_dl/extractor/videopremium.py @@ -0,0 +1,40 @@ +import re +import random + +from .common import InfoExtractor + + +class VideoPremiumIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P\w+)(?:/.*)?' + _TEST = { + u'url': u'http://videopremium.tv/4w7oadjsf156', + u'file': u'4w7oadjsf156.f4v', + u'info_dict': { + u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4" + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://videopremium.tv/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'\s*(.+?)\s*<', + webpage, u'video title') + + return [{ + 'id': video_id, + 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16), + 'play_path': "mp4:%s.f4v" % video_id, + 'page_url': "http://videopremium.tv/" + video_id, + 'player_url': "http://videopremium.tv/uplayer/uppod.swf", + 'ext': 'f4v', + 'title': video_title, + }] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4a7d82b7a..cea29f035 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -17,7 +17,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)(?:[?].*)?$' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fa759d30c..361619694 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,8 +11,8 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' + _TESTS = [{ u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', u'file': u'1509445.flv', u'md5': u'9f48e0e8d58e3076bb236ff412ab62fa', @@ -21,13 +21,24 @@ class XHamsterIE(InfoExtractor): u"uploader_id": u"Ruseful2011", u"title": u"FemaleAgent Shy beauty takes the bait" } - } + }, + { + u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + u'file': u'2221348.flv', + u'md5': u'e767b9475de189320f691f49c679c4c7', + u'info_dict': { + u"upload_date": u"20130914", + u"uploader_id": u"jojo747400", + u"title": u"Britney Spears Sexy Booty" + } + }] def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id + seo = mobj.group('seo') + mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 32d5b9477..464b498f5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,4 +1,3 @@ -import datetime import itertools import json import re @@ -6,86 +5,104 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_urllib_parse, - - ExtractorError, + compat_urlparse, + determine_ext, + clean_html, ) + class YahooIE(InfoExtractor): IE_DESC = u'Yahoo screen' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - _TEST = { - u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - u'file': u'214727115.flv', - u'md5': u'2e717f169c1be93d84d3794a00d4a325', - u'info_dict': { - u"title": u"Julian Smith & Travis Legg Watch Julian Smith" + _TESTS = [ + { + u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + u'file': u'214727115.flv', + u'info_dict': { + u'title': u'Julian Smith & Travis Legg Watch Julian Smith', + u'description': u'Julian and Travis watch Julian Smith', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, }, - u'skip': u'Requires rtmpdump' - } + { + u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', + u'file': u'103000935.flv', + u'info_dict': { + u'title': u'Codefellas - The Cougar Lies with Spanish Moss', + u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - [A-Za-z0-9]+)(\.html|/v.swf)' + _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P[A-Za-z0-9]+)(?:\.html|/v\.swf|)' _TEST = { u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", u"file": u"XNDgyMDQ2NTQw_part00.flv", @@ -66,6 +66,12 @@ class YoukuIE(InfoExtractor): self.report_extraction(video_id) try: config = json.loads(jsondata) + error_code = config['data'][0].get('error_code') + if error_code: + # -8 means blocked outside China. + error = config['data'][0].get('error') # Chinese and English, separated by newline. + raise ExtractorError(error or u'Server reported error %i' % error_code, + expected=True) video_title = config['data'][0]['title'] seed = config['data'][0]['seed'] @@ -89,6 +95,7 @@ class YoukuIE(InfoExtractor): fileid = config['data'][0]['streamfileids'][format] keys = [s['k'] for s in config['data'][0]['segs'][format]] + # segs is usually a dictionary, but an empty *list* if an error occured. except (UnicodeDecodeError, ValueError, KeyError): raise ExtractorError(u'Unable to extract info section') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index c85fd4b5a..b1f93dd1b 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -51,6 +51,7 @@ class YouPornIE(InfoExtractor): req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) + age_limit = self._rta_search(webpage) # Get JSON parameters json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') @@ -115,7 +116,8 @@ class YouPornIE(InfoExtractor): 'ext': extension, 'format': format, 'thumbnail': thumbnail, - 'description': video_description + 'description': video_description, + 'age_limit': age_limit, }) if self._downloader.params.get('listformats', None): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f227e2086..4347651d7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,28 +1,39 @@ # coding: utf-8 +import collections +import errno +import io +import itertools import json -import netrc +import os.path import re import socket -import itertools +import string +import struct +import traceback import xml.etree.ElementTree +import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_chr, compat_http_client, compat_parse_qs, compat_urllib_error, compat_urllib_parse, compat_urllib_request, + compat_urlparse, compat_str, clean_html, + get_cachedir, get_element_by_id, ExtractorError, unescapeHTML, unified_strdate, orderedSet, + write_json_file, ) class YoutubeBaseInfoExtractor(InfoExtractor): @@ -352,7 +363,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", + u"description": u"md5:5b292926389560516e384ac437c0ec07", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -369,21 +380,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"justintimberlakeVEVO" } }, - { - u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE', - u'file': u'TGi3HqYrWHE.mp4', - u'note': u'm3u8 video', - u'info_dict': { - u'title': u'Triathlon - Men - London 2012 Olympic Games', - u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games', - u'uploader': u'olympic', - u'upload_date': u'20120807', - u'uploader_id': u'olympic', - }, - u'params': { - u'skip_download': True, - }, - }, ] @@ -393,6 +389,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._player_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -413,11 +413,664 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url, slen): + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', + player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + # Read from filesystem cache + func_id = '%s_%s_%d' % (player_type, player_id, slen) + assert os.path.basename(func_id) == func_id + cache_dir = get_cachedir(self._downloader.params) + + cache_enabled = cache_dir is not None + if cache_enabled: + cache_fn = os.path.join(os.path.expanduser(cache_dir), + u'youtube-sigfuncs', + func_id + '.json') + try: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + cache_spec = json.load(cachef) + return lambda s: u''.join(s[i] for i in cache_spec) + except IOError: + pass # No cache available + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, player_id), + errnote=u'Download of %s failed' % player_url) + res = self._parse_sig_js(code) + elif player_type == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, player_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + res = self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + if cache_enabled: + try: + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] + try: + os.makedirs(os.path.dirname(cache_fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(cache_spec, cache_fn) + except Exception: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Writing cache to %r failed: %s' % (cache_fn, tb)) + + return res + + def _print_sig_code(self, func, slen): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = u'' if start == 0 else str(start) + ends = (u':%d' % (end+step)) if end + step >= 0 else u':' + steps = u'' if step == 1 else (u':%d' % step) + return u's[%s%s%s]' % (starts, ends, steps) + + step = None + start = '(Never used)' # Quelch pyflakes warnings - start will be + # set as soon as step is set + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield u's[%d]' % prev + if step is None: + yield u's[%d]' % i + else: + yield _genslice(start, i, step) + + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = func(test_string) + cache_spec = [ord(c) for c in cache_res] + expr_code = u' + '.join(gen_sig_code(cache_spec)) + code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) + self.to_screen(u'Extracted signature function:\n' + code) + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExtractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count = u30() + for class_id in range(class_count): + name_idx = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + if method_idx in method_idxs: + m = Method(code, local_count) + methods[method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == u'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == u'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 79: # callpropvoid + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + u'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 92: + if player_url is not None: + try: + player_id = (player_url, len(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, len(s) + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) + except Exception: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Automatic signature extraction failed: ' + tb) + + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') + + return self._static_decrypt_signature( + s, video_id, player_url, age_gate) + + def _static_decrypt_signature(self, s, video_id, player_url, age_gate): + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] + elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: @@ -427,15 +1080,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] elif len(s) == 85: return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: - return s[81:36:-1] + s[0] + s[35:2:-1] + return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] elif len(s) == 83: - return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] + return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] elif len(s) == 82: - return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54] + return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] elif len(s) == 80: @@ -446,15 +1099,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _decrypt_signature_age_gate(self, s): - # The videos with age protection use another player, so the algorithms - # can be different. - if len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] - else: - # Fallback to the other algortihms - return self._decrypt_signature(s) - def _get_available_subtitles(self, video_id): try: sub_list = self._download_webpage( @@ -472,6 +1116,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat'), + 'name': l[0], }) url = u'http://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url @@ -605,10 +1250,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url_map[itag] = format_url return url_map - def _real_extract(self, url): - if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url): - self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).') + def _extract_annotations(self, video_id): + url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id + return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') + def _real_extract(self, url): # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: @@ -627,7 +1273,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -691,9 +1337,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self._downloader.report_warning(u'unable to extract uploader nickname') # title - if 'title' not in video_info: - raise ExtractorError(u'Unable to extract video title') - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + if 'title' in video_info: + video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + else: + self._downloader.report_warning(u'Unable to extract video title') + video_title = u'_' # thumbnail image # We try first to get a high quality image: @@ -703,7 +1351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: self._downloader.report_warning(u'unable to extract video thumbnail') - video_thumbnail = '' + video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -738,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + # Decide which formats to download try: @@ -748,6 +1401,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): args = info['args'] # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted + if 'url_encoded_fmt_stream_map' not in args: + raise ValueError(u'No stream_map present') # caught below m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map']) if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) @@ -780,21 +1435,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: + encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): - s = url_data['s'][0] if age_gate: - player = 'flash player' + if player_url is None: + player_version = 'unknown' + else: + player_version = self._search_regex( + r'-(.+)\.swf$', player_url, + u'flash player', fatal=False) + player_desc = 'flash player %s' % player_version else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, url_data['itag'][0], player)) - encrypted_sig = url_data['s'][0] - if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) - else: - signature = self._decrypt_signature(encrypted_sig) + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + + if not age_gate: + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + player_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -810,7 +1478,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return else: - raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') + raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') results = [] for format_param, video_real_url in video_url_list: @@ -834,7 +1502,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'player_url': player_url, 'subtitles': video_subtitles, - 'duration': video_duration + 'duration': video_duration, + 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations }) return results @@ -868,9 +1538,19 @@ class YoutubePlaylistIE(InfoExtractor): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) + return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + else: + self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) # Download playlist videos from API - playlist_id = mobj.group(1) or mobj.group(2) videos = [] for page_num in itertools.count(1): @@ -965,7 +1645,7 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' @@ -1005,6 +1685,9 @@ class YoutubeUserIE(InfoExtractor): response = json.loads(page) except ValueError as err: raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS + break # Extract video identifiers ids_in_page = [] @@ -1155,3 +1838,18 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') return self.url_result(playlist_id, 'YoutubePlaylist') + + +class YoutubeTruncatedURLIE(InfoExtractor): + IE_NAME = 'youtube:truncated_url' + IE_DESC = False # Do not list + _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$' + + def _real_extract(self, url): + raise ExtractorError( + u'Did you forget to quote the URL? Remember that & is a meta ' + u'character in most shells, so you want to put the URL in quotes, ' + u'like youtube-dl ' + u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\'' + u' (or simply youtube-dl BaW_jenozKc ).', + expected=True) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 418509cb9..faed7ff7f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -2,16 +2,14 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, - unescapeHTML, ) + class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P[^/\?]+)(?:\?.*)?' - _TITLE = r'(?P.*)</h1>' + _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' - _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -19,6 +17,9 @@ class ZDFIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') + if mobj.group('hash'): + url = url.replace(u'#', u'', 1) + html = self._download_webpage(url, video_id) streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] if streams is None: @@ -27,39 +28,48 @@ class ZDFIE(InfoExtractor): # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url # choose first/default media type and highest quality for now - for s in streams: #find 300 - dsl1000mbit - if s['quality'] == '300' and s['media_type'] == 'wstreaming': - stream_=s - break - for s in streams: #find veryhigh - dsl2000mbit - if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working - stream_=s - break - if stream_ is None: + def stream_pref(s): + TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + try: + type_pref = TYPE_ORDER.index(s['media_type']) + except ValueError: + type_pref = 999 + + QUALITY_ORDER = ['veryhigh', '300'] + try: + quality_pref = QUALITY_ORDER.index(s['quality']) + except ValueError: + quality_pref = 999 + + return (type_pref, quality_pref) + + sorted_streams = sorted(streams, key=stream_pref) + if not sorted_streams: raise ExtractorError(u'No stream found.') + stream = sorted_streams[0] - media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + media_link = self._download_webpage( + stream['video_url'], + video_id, + u'Get stream URL') - self.report_extraction(video_id) - mobj = re.search(self._TITLE, html) - if mobj is None: - raise ExtractorError(u'Cannot extract title') - title = unescapeHTML(mobj.group('title')) + MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' - mobj = re.search(self._MMS_STREAM, media_link) + mobj = re.search(self._MEDIA_STREAM, media_link) if mobj is None: - mobj = re.search(self._RTSP_STREAM, media_link) + mobj = re.search(RTSP_STREAM, media_link) if mobj is None: raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - mms_url = mobj.group('video_url') + video_url = mobj.group('video_url') - mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) - if mobj is None: - raise ExtractorError(u'Cannot extract extention') - ext = mobj.group('ext') + title = self._html_search_regex( + r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', + html, u'title') - return [{'id': video_id, - 'url': mms_url, - 'title': title, - 'ext': ext - }] + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url) + } diff --git a/youtube_dl/update.py b/youtube_dl/update.py index ccab6f27f..0689a4891 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -1,6 +1,9 @@ +import io import json import traceback import hashlib +import subprocess +import sys from zipimport import zipimporter from .utils import * @@ -34,7 +37,7 @@ def rsa_verify(message, signature, key): if signature != sha256(message).digest(): return False return True -def update_self(to_screen, verbose, filename): +def update_self(to_screen, verbose): """Update the program file with the latest version from the repository""" UPDATE_URL = "http://rg3.github.io/youtube-dl/update/" @@ -42,7 +45,6 @@ def update_self(to_screen, verbose, filename): JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) - if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, "frozen"): to_screen(u'It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.') return @@ -75,11 +77,18 @@ def update_self(to_screen, verbose, filename): to_screen(u'ERROR: the versions file signature is invalid. Aborting.') return - to_screen(u'Updating to version ' + versions_info['latest'] + '...') - version = versions_info['versions'][versions_info['latest']] + version_id = versions_info['latest'] + to_screen(u'Updating to version ' + version_id + '...') + version = versions_info['versions'][version_id] print_notes(to_screen, versions_info['versions']) + filename = sys.argv[0] + # Py2EXE: Filename could be different + if hasattr(sys, "frozen") and not os.path.isfile(filename): + if os.path.isfile(filename + u'.exe'): + filename += u'.exe' + if not os.access(filename, os.W_OK): to_screen(u'ERROR: no write permissions on %s' % filename) return @@ -116,16 +125,18 @@ def update_self(to_screen, verbose, filename): try: bat = os.path.join(directory, 'youtube-dl-updater.bat') - b = open(bat, 'w') - b.write(""" -echo Updating youtube-dl... + with io.open(bat, 'w') as batfile: + batfile.write(u""" +@echo off +echo Waiting for file handle to be closed ... ping 127.0.0.1 -n 5 -w 1000 > NUL -move /Y "%s.new" "%s" -del "%s" - \n""" %(exe, exe, bat)) - b.close() +move /Y "%s.new" "%s" > NUL +echo Updated youtube-dl to version %s. +start /b "" cmd /c del "%%~f0"&exit /b" + \n""" % (exe, exe, version_id)) - os.startfile(bat) + subprocess.Popen([bat]) # Continues to run in the background + return # Do not show premature success messages except (IOError, OSError) as err: if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to overwrite current version') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 814a9b6be..3e81c308b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -9,6 +9,7 @@ import io import json import locale import os +import pipes import platform import re import socket @@ -66,6 +67,12 @@ try: except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -169,7 +176,7 @@ def compat_ord(c): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -223,6 +230,19 @@ else: return f return None +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -709,6 +729,7 @@ def unified_strdate(date_str): '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M', '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: try: @@ -818,3 +839,109 @@ def intlist_to_bytes(xs): return ''.join([chr(x) for x in xs]) else: return bytes(xs) + + +def get_cachedir(params={}): + cache_root = os.environ.get('XDG_CACHE_HOME', + os.path.expanduser('~/.cache')) + return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) + + +# Cross-platform file locking +if sys.platform == 'win32': + import ctypes.wintypes + import msvcrt + + class OVERLAPPED(ctypes.Structure): + _fields_ = [ + ('Internal', ctypes.wintypes.LPVOID), + ('InternalHigh', ctypes.wintypes.LPVOID), + ('Offset', ctypes.wintypes.DWORD), + ('OffsetHigh', ctypes.wintypes.DWORD), + ('hEvent', ctypes.wintypes.HANDLE), + ] + + kernel32 = ctypes.windll.kernel32 + LockFileEx = kernel32.LockFileEx + LockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwFlags + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + LockFileEx.restype = ctypes.wintypes.BOOL + UnlockFileEx = kernel32.UnlockFileEx + UnlockFileEx.argtypes = [ + ctypes.wintypes.HANDLE, # hFile + ctypes.wintypes.DWORD, # dwReserved + ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow + ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh + ctypes.POINTER(OVERLAPPED) # Overlapped + ] + UnlockFileEx.restype = ctypes.wintypes.BOOL + whole_low = 0xffffffff + whole_high = 0x7fffffff + + def _lock_file(f, exclusive): + overlapped = OVERLAPPED() + overlapped.Offset = 0 + overlapped.OffsetHigh = 0 + overlapped.hEvent = 0 + f._lock_file_overlapped_p = ctypes.pointer(overlapped) + handle = msvcrt.get_osfhandle(f.fileno()) + if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Locking file failed: %r' % ctypes.FormatError()) + + def _unlock_file(f): + assert f._lock_file_overlapped_p + handle = msvcrt.get_osfhandle(f.fileno()) + if not UnlockFileEx(handle, 0, + whole_low, whole_high, f._lock_file_overlapped_p): + raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) + +else: + import fcntl + + def _lock_file(f, exclusive): + fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + + def _unlock_file(f): + fcntl.lockf(f, fcntl.LOCK_UN) + + +class locked_file(object): + def __init__(self, filename, mode, encoding=None): + assert mode in ['r', 'a', 'w'] + self.f = io.open(filename, mode, encoding=encoding) + self.mode = mode + + def __enter__(self): + exclusive = self.mode != 'r' + try: + _lock_file(self.f, exclusive) + except IOError: + self.f.close() + raise + return self + + def __exit__(self, etype, value, traceback): + try: + _unlock_file(self.f) + finally: + self.f.close() + + def __iter__(self): + return iter(self.f) + + def write(self, *args): + return self.f.write(*args) + + def read(self, *args): + return self.f.read(*args) + + +def shell_quote(args): + return ' '.join(map(pipes.quote, args)) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 80ccfbd4f..1004af116 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.09.17' +__version__ = '2013.10.09'