From: Philipp Hagemeister Date: Wed, 28 Aug 2013 11:10:59 +0000 (+0200) Subject: Merge remote-tracking branch 'Huarong/master' X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=f8b362739e4f469b501aa804beb95cf1cfb1c916;hp=d5b00ee6e0ba70fd5d87752e8772fc1c39e4bd59;p=youtube-dl Merge remote-tracking branch 'Huarong/master' --- diff --git a/README.md b/README.md index 560bcdca1..75068fe56 100644 --- a/README.md +++ b/README.md @@ -120,18 +120,20 @@ which means you can modify it, redistribute it or use it however you like. --max-quality FORMAT highest quality format to download -F, --list-formats list all available formats (currently youtube only) + +## Subtitle Options: --write-sub write subtitle file (currently youtube only) --write-auto-sub write automatic subtitle file (currently youtube only) --only-sub [deprecated] alias of --skip-download --all-subs downloads all the available subtitles of the - video (currently youtube only) + video --list-subs lists all available subtitles for the video - (currently youtube only) - --sub-format FORMAT subtitle format [srt/sbv/vtt] (default=srt) - (currently youtube only) - --sub-lang LANG language of the subtitles to download (optional) - use IETF language tags like 'en' + --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] youtube + only) + --sub-lang LANGS languages of the subtitles to download (optional) + separated by commas, use IETF language tags like + 'en,pt' ## Authentication Options: -u, --username USERNAME account username @@ -153,6 +155,8 @@ which means you can modify it, redistribute it or use it however you like. processing; the video is erased by default --no-post-overwrites do not overwrite post-processed files; the post- processed files are overwritten by default + --embed-subs embed subtitles in the video (only for mp4 + videos) # CONFIGURATION diff --git a/devscripts/gh-pages/add-version.py b/devscripts/gh-pages/add-version.py index 6af8bb9d8..116420ef2 100755 --- a/devscripts/gh-pages/add-version.py +++ b/devscripts/gh-pages/add-version.py @@ -6,28 +6,32 @@ import hashlib import urllib.request if len(sys.argv) <= 1: - print('Specify the version number as parameter') - sys.exit() + print('Specify the version number as parameter') + sys.exit() version = sys.argv[1] with open('update/LATEST_VERSION', 'w') as f: - f.write(version) + f.write(version) versions_info = json.load(open('update/versions.json')) if 'signature' in versions_info: - del versions_info['signature'] + del versions_info['signature'] new_version = {} -filenames = {'bin': 'youtube-dl', 'exe': 'youtube-dl.exe', 'tar': 'youtube-dl-%s.tar.gz' % version} +filenames = { + 'bin': 'youtube-dl', + 'exe': 'youtube-dl.exe', + 'tar': 'youtube-dl-%s.tar.gz' % version} for key, filename in filenames.items(): - print('Downloading and checksumming %s...' %filename) - url = 'http://youtube-dl.org/downloads/%s/%s' % (version, filename) - data = urllib.request.urlopen(url).read() - sha256sum = hashlib.sha256(data).hexdigest() - new_version[key] = (url, sha256sum) + print('Downloading and checksumming %s...' % filename) + url = 'https://yt-dl.org/downloads/%s/%s' % (version, filename) + data = urllib.request.urlopen(url).read() + sha256sum = hashlib.sha256(data).hexdigest() + new_version[key] = (url, sha256sum) versions_info['versions'][version] = new_version versions_info['latest'] = version -json.dump(versions_info, open('update/versions.json', 'w'), indent=4, sort_keys=True) \ No newline at end of file +with open('update/versions.json', 'w') as jsonf: + json.dump(versions_info, jsonf, indent=4, sort_keys=True) diff --git a/devscripts/gh-pages/update-feed.py b/devscripts/gh-pages/update-feed.py index cfff05fc8..16571a924 100755 --- a/devscripts/gh-pages/update-feed.py +++ b/devscripts/gh-pages/update-feed.py @@ -22,7 +22,7 @@ entry_template=textwrap.dedent("""
- Downloads available at http://youtube-dl.org/downloads/@VERSION@/ + Downloads available at https://yt-dl.org/downloads/@VERSION@/
@@ -54,4 +54,3 @@ atom_template = atom_template.replace('@ENTRIES@', entries_str) with open('update/releases.atom','w',encoding='utf-8') as atom_file: atom_file.write(atom_template) - diff --git a/devscripts/release.sh b/devscripts/release.sh index 46c31e437..24c9ad8d8 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -67,7 +67,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz" (cd build/$version/ && sha512sum $RELEASE_FILES > SHA2-512SUMS) git checkout HEAD -- youtube-dl youtube-dl.exe -/bin/echo -e "\n### Signing and uploading the new binaries to youtube-dl.org..." +/bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..." for f in $RELEASE_FILES; do gpg --detach-sig "build/$version/$f"; done scp -r "build/$version" ytdl@yt-dl.org:html/tmp/ ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/" diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index fd0120650..917e8f79d 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -11,30 +11,36 @@ tests = [ # 90 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), - # 88 + # 89 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", + "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), + # 88 - vflapUV9V 2013/08/28 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", - "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 - vflART1Nf 2013/07/24 + "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), + # 87 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "tyuioplkjhgfdsazxcv"), - # 86 - vflm_D8eE 2013/07/31 + "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), + # 86 - vflh9ybst 2013/08/23 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK.<"), + # 85 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", - "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), - # 84 + ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), + # 84 - vflh9ybst 2013/08/23 (sporadic) ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 - vflcaqGO8 2013/07/11 + "yuioplkjhgfdsazxcvbnm1234567890QWERrYUIOPLKqHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<"), + # 83 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"), - # 82 + ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), + # 82 - vflZK4ZYR 2013/08/23 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", - "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), + "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 80 - vflZK4ZYR 2013/08/23 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>", + "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>"), # 79 - vflLC8JvQ 2013/07/25 (sporadic) ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), diff --git a/test/test_all_urls.py b/test/test_all_urls.py index c73d0e467..c54faa380 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -50,6 +50,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc') + self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc') def test_no_duplicates(self): ies = gen_extractors() diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py deleted file mode 100644 index d06f3c8aa..000000000 --- a/test/test_youtube_sig.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python - -import unittest -import sys - -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from youtube_dl.extractor.youtube import YoutubeIE -from helper import FakeYDL - -ie = YoutubeIE(FakeYDL()) -sig = ie._decrypt_signature -sig_age_gate = ie._decrypt_signature_age_gate - -class TestYoutubeSig(unittest.TestCase): - def test_92(self): - wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8" - right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7" - self.assertEqual(sig(wrong), right) - - def test_90(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`" - right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|" - self.assertEqual(sig(wrong), right) - - def test_88(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<" - right = "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej" - self.assertEqual(sig(wrong), right) - - def test_87(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" - right = "tyuioplkjhgfdsazxcv" - self.assertEqual(sig(wrong), right) - - def test_86(self): - wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<" - right = ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK 99: + return '--:--:--' + if hours == 0: + return '%02d:%02d' % (mins, secs) + else: + return '%02d:%02d:%02d' % (hours, mins, secs) + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -78,10 +89,7 @@ class FileDownloader(object): return '--:--' rate = float(current) / dif eta = int((float(total) - float(current)) / rate) - (eta_mins, eta_secs) = divmod(eta, 60) - if eta_mins > 99: - return '--:--' - return '%02d:%02d' % (eta_mins, eta_secs) + return FileDownloader.format_seconds(eta) @staticmethod def calc_speed(start, now, bytes): @@ -230,12 +238,14 @@ class FileDownloader(object): """Report it was impossible to resume download.""" self.to_screen(u'[download] Unable to resume') - def report_finish(self): + def report_finish(self, data_len_str, tot_time): """Report download finished.""" if self.params.get('noprogress', False): self.to_screen(u'[download] Download completed') else: - self.to_screen(u'') + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + self.to_screen(u'\r%s[download] 100%% of %s in %s' % + (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) @@ -538,7 +548,7 @@ class FileDownloader(object): self.report_error(u'Did not get any data blocks') return False stream.close() - self.report_finish() + self.report_finish(data_len_str, (time.time() - start)) if data_len is not None and byte_counter != data_len: raise ContentTooShortError(byte_counter, int(data_len)) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fddf58606..c02ed7148 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -71,12 +71,17 @@ class FFmpegPostProcessor(PostProcessor): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] return dict((program, executable(program)) for program in programs) - def run_ffmpeg(self, path, out_path, opts): + def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): if not self._exes['ffmpeg'] and not self._exes['avconv']: raise FFmpegPostProcessorError(u'ffmpeg or avconv not found. Please install one.') - cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y', '-i', encodeFilename(path)] + + files_cmd = [] + for path in input_paths: + files_cmd.extend(['-i', encodeFilename(path)]) + cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y'] + files_cmd + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: @@ -84,6 +89,9 @@ class FFmpegPostProcessor(PostProcessor): msg = stderr.strip().split('\n')[-1] raise FFmpegPostProcessorError(msg) + def run_ffmpeg(self, path, out_path, opts): + self.run_ffmpeg_multiple_files([path], out_path, opts) + def _ffmpeg_filename_argument(self, fn): # ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details if fn.startswith(u'-'): @@ -232,3 +240,227 @@ class FFmpegVideoConvertor(FFmpegPostProcessor): information['format'] = self._preferedformat information['ext'] = self._preferedformat return False,information + + +class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + def __init__(self, downloader=None, subtitlesformat='srt'): + super(FFmpegEmbedSubtitlePP, self).__init__(downloader) + self._subformat = subtitlesformat + + @classmethod + def _conver_lang_code(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + def run(self, information): + if information['ext'] != u'mp4': + self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files') + return True, information + sub_langs = [key for key in information['subtitles']] + + filename = information['filepath'] + input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + + opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy'] + for (i, lang) in enumerate(sub_langs): + opts.extend(['-map', '%d:0' % (i+1), '-c:s:%d' % i, 'mov_text']) + lang_code = self._conver_lang_code(lang) + if lang_code is not None: + opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) + opts.extend(['-f', 'mp4']) + + temp_filename = filename + u'.temp' + self._downloader.to_screen(u'[ffmpeg] Embedding subtitles in \'%s\'' % filename) + self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + + return True, information diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 496866900..b289bd9e2 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -76,7 +76,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video listsubtitles: Lists all available subtitles for the video subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) - subtitleslang: Language of the subtitles to download + subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file @@ -97,6 +97,7 @@ class YoutubeDL(object): def __init__(self, params): """Create a FileDownloader object with the given options.""" self._ies = [] + self._ies_instances = {} self._pps = [] self._progress_hooks = [] self._download_retcode = 0 @@ -111,8 +112,21 @@ class YoutubeDL(object): def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) + self._ies_instances[ie.ie_key()] = ie ie.set_downloader(self) + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. + """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + def add_default_info_extractors(self): """ Add the InfoExtractors returned by gen_extractors to the end of the list @@ -264,7 +278,7 @@ class YoutubeDL(object): self.report_error(u'Erroneous output template') return None except ValueError as err: - self.report_error(u'Insufficient system charset ' + repr(preferredencoding())) + self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict): @@ -294,9 +308,7 @@ class YoutubeDL(object): ''' if ie_key: - ie = get_info_extractor(ie_key)() - ie.set_downloader(self) - ies = [ie] + ies = [self.get_info_extractor(ie_key)] else: ies = self._ies @@ -448,7 +460,8 @@ class YoutubeDL(object): if self.params.get('forceid', False): compat_print(info_dict['id']) if self.params.get('forceurl', False): - compat_print(info_dict['url']) + # For RTMP URLs, also include the playpath + compat_print(info_dict['url'] + info_dict.get('play_path', u'')) if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: compat_print(info_dict['thumbnail']) if self.params.get('forcedescription', False) and 'description' in info_dict: @@ -483,41 +496,28 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return - if (self.params.get('writesubtitles', False) or self.params.get('writeautomaticsub')) and 'subtitles' in info_dict and info_dict['subtitles']: + subtitles_are_requested = any([self.params.get('writesubtitles', False), + self.params.get('writeautomaticsub'), + self.params.get('allsubtitles', False)]) + + if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitle = info_dict['subtitles'][0] - (sub_error, sub_lang, sub) = subtitle + subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - if sub_error: - self.report_warning("Some error while getting the subtitles") - else: + for sub_lang in subtitles.keys(): + sub = subtitles[sub_lang] + if sub is None: + continue try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + sub_filename = subtitles_filename(filename, sub_lang, sub_format) self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub) except (OSError, IOError): self.report_error(u'Cannot write subtitles file ' + descfn) return - if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: - subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat') - for subtitle in subtitles: - (sub_error, sub_lang, sub) = subtitle - if sub_error: - self.report_warning("Some error while getting the subtitles") - else: - try: - sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format - self.report_writesubtitles(sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) - except (OSError, IOError): - self.report_error(u'Cannot write subtitles file ' + descfn) - return - if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' self.report_writeinfojson(infofn) @@ -547,7 +547,7 @@ class YoutubeDL(object): try: success = self.fd._do_download(filename, info_dict) except (OSError, IOError) as err: - raise UnavailableVideoError() + raise UnavailableVideoError(err) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_error(u'unable to download video data: %s' % str(err)) return diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bf040aacd..b33a18a26 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -27,6 +27,7 @@ __authors__ = ( 'Johny Mo Swag', 'Axel Noack', 'Albert Kim', + 'Pierre Rudloff', ) __license__ = 'Public Domain' @@ -44,6 +45,7 @@ import sys import warnings import platform + from .utils import * from .update import update_self from .version import __version__ @@ -82,6 +84,9 @@ def parseOpts(overrideArguments=None): return "".join(opts) + def _comma_separated_values_options_callback(option, opt_str, value, parser): + setattr(parser.values, option.dest, value.split(',')) + def _find_term_columns(): columns = os.environ.get('COLUMNS', None) if columns: @@ -119,6 +124,7 @@ def parseOpts(overrideArguments=None): selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') + subtitles = optparse.OptionGroup(parser, 'Subtitle Options') downloader = optparse.OptionGroup(parser, 'Download Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') filesystem = optparse.OptionGroup(parser, 'Filesystem Options') @@ -185,27 +191,29 @@ def parseOpts(overrideArguments=None): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') - video_format.add_option('--write-sub', '--write-srt', + + subtitles.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', help='write subtitle file (currently youtube only)', default=False) - video_format.add_option('--write-auto-sub', '--write-automatic-sub', + subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (currently youtube only)', default=False) - video_format.add_option('--only-sub', + subtitles.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) - video_format.add_option('--all-subs', + subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', - help='downloads all the available subtitles of the video (currently youtube only)', default=False) - video_format.add_option('--list-subs', + help='downloads all the available subtitles of the video', default=False) + subtitles.add_option('--list-subs', action='store_true', dest='listsubtitles', - help='lists all available subtitles for the video (currently youtube only)', default=False) - video_format.add_option('--sub-format', + help='lists all available subtitles for the video', default=False) + subtitles.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', - help='subtitle format [srt/sbv/vtt] (default=srt) (currently youtube only)', default='srt') - video_format.add_option('--sub-lang', '--srt-lang', - action='store', dest='subtitleslang', metavar='LANG', - help='language of the subtitles to download (optional) use IETF language tags like \'en\'') + help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') + subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang', + action='callback', dest='subtitleslang', metavar='LANGS', type='str', + default=[], callback=_comma_separated_values_options_callback, + help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') downloader.add_option('-r', '--rate-limit', dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') @@ -320,6 +328,8 @@ def parseOpts(overrideArguments=None): help='keeps the video file on disk after the post-processing; the video is erased by default') postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False, help='do not overwrite post-processed files; the post-processed files are overwritten by default') + postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, + help='embed subtitles in the video (only for mp4 videos)') parser.add_option_group(general) @@ -328,6 +338,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) + parser.add_option_group(subtitles) parser.add_option_group(authentication) parser.add_option_group(postproc) @@ -343,7 +354,7 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) - commandLineConf = sys.argv[1:] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) if opts.verbose: @@ -377,7 +388,7 @@ def _real_main(argv=None): # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent - + # Set referer if opts.referer is not None: std_headers['Referer'] = opts.referer @@ -420,6 +431,10 @@ def _real_main(argv=None): proxy_handler = compat_urllib_request.ProxyHandler(proxies) https_handler = make_HTTPS_handler(opts) opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders =[] compat_urllib_request.install_opener(opener) socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) @@ -567,7 +582,7 @@ def _real_main(argv=None): 'allsubtitles': opts.allsubtitles, 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, - 'subtitleslang': opts.subtitleslang, + 'subtitleslangs': opts.subtitleslang, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), 'max_downloads': opts.max_downloads, @@ -597,7 +612,7 @@ def _real_main(argv=None): sys.exc_clear() except: pass - sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n') + sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') ydl.add_default_info_extractors() @@ -607,6 +622,8 @@ def _real_main(argv=None): ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) if opts.recodevideo: ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) + if opts.embedsubtitles: + ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) # Update version if opts.update_self: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3a08d676f..6b5037c8c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,3 +1,5 @@ +from .appletrailers import AppleTrailersIE +from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ArteTvIE @@ -6,7 +8,10 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE @@ -29,6 +34,7 @@ from .gametrailers import GametrailersIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE @@ -36,6 +42,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE @@ -43,18 +50,26 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE +from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .nbc import NBCNewsIE +from .ooyala import OoyalaIE +from .pbs import PBSIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .ro220 import Ro220IE from .roxwel import RoxwelIE +from .rtlnow import RTLnowIE from .sina import SinaIE +from .slashdot import SlashdotIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE @@ -66,16 +81,18 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE +from .unistra import UnistraIE from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE +from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE @@ -109,12 +126,14 @@ _ALL_CLASSES = [ ] _ALL_CLASSES.append(GenericIE) + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. """ return [klass() for klass in _ALL_CLASSES] + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name+'IE'] diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py new file mode 100644 index 000000000..82a785a19 --- /dev/null +++ b/youtube_dl/extractor/addanime.py @@ -0,0 +1,75 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + compat_HTTPError, + compat_str, + compat_urllib_parse, + compat_urllib_parse_urlparse, + + ExtractorError, +) + + +class AddAnimeIE(InfoExtractor): + + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' + IE_NAME = u'AddAnime' + _TEST = { + u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + u'file': u'24MR3YO5SAS9.flv', + u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'info_dict': { + u"description": u"One Piece 606", + u"title": u"One Piece 606" + } + } + + def _real_extract(self, url): + try: + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + + redir_webpage = ee.cause.read().decode('utf-8') + action = self._search_regex( + r'
', + redir_webpage, u'redirect vc value') + av = re.search( + r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', + redir_webpage) + if av is None: + raise ExtractorError(u'Cannot find redirect math task') + av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3)) + + parsed_url = compat_urllib_parse_urlparse(url) + av_val = av_res + len(parsed_url.netloc) + confirm_url = ( + parsed_url.scheme + u'://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse.urlencode({ + 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) + self._download_webpage( + confirm_url, video_id, + note=u'Confirming after redirect') + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_title = self._og_search_title(webpage) + video_description = self._og_search_description(webpage) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description + } diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py new file mode 100644 index 000000000..8b191c196 --- /dev/null +++ b/youtube_dl/extractor/appletrailers.py @@ -0,0 +1,166 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class AppleTrailersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P[^/]+)/(?P[^/]+)' + _TEST = { + u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", + u"playlist": [ + { + u"file": u"manofsteel-trailer4.mov", + u"md5": u"11874af099d480cc09e103b189805d5f", + u"info_dict": { + u"duration": 111, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", + u"title": u"Trailer 4", + u"upload_date": u"20130523", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer3.mov", + u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"info_dict": { + u"duration": 182, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", + u"title": u"Trailer 3", + u"upload_date": u"20130417", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-trailer.mov", + u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"info_dict": { + u"duration": 148, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", + u"title": u"Trailer", + u"upload_date": u"20121212", + u"uploader_id": u"wb", + }, + }, + { + u"file": u"manofsteel-teaser.mov", + u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"info_dict": { + u"duration": 93, + u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", + u"title": u"Teaser", + u"upload_date": u"20120721", + u"uploader_id": u"wb", + }, + } + ] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + movie = mobj.group('movie') + uploader_id = mobj.group('company') + + playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_snippet = self._download_webpage(playlist_url, movie) + playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) + playlist_html = u'' + playlist_cleaned + u'' + + size_cache = {} + + doc = xml.etree.ElementTree.fromstring(playlist_html) + playlist = [] + for li in doc.findall('./div/ul/li'): + title = li.find('.//h3').text + video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() + thumbnail = li.find('.//img').attrib['src'] + + date_el = li.find('.//p') + upload_date = None + m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) + if m: + upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') + runtime_el = date_el.find('./br') + m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', runtime_el.tail) + duration = None + if m: + duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + + formats = [] + for formats_el in li.findall('.//a'): + if formats_el.attrib['class'] != 'OverlayPanel': + continue + target = formats_el.attrib['target'] + + format_code = formats_el.text + if 'Automatic' in format_code: + continue + + size_q = formats_el.attrib['href'] + size_id = size_q.rpartition('#videos-')[2] + if size_id not in size_cache: + size_url = url + size_q + sizepage_html = self._download_webpage( + size_url, movie, + note=u'Downloading size info %s' % size_id, + errnote=u'Error while downloading size info %s' % size_id, + ) + _doc = xml.etree.ElementTree.fromstring(sizepage_html) + size_cache[size_id] = _doc + + sizepage_doc = size_cache[size_id] + links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') + for vid_a in links: + href = vid_a.get('href') + if not href.endswith(target): + continue + detail_q = href.partition('#')[0] + detail_url = url + '/' + detail_q + + m = re.match(r'includes/(?P[^/]+)/', detail_q) + detail_id = m.group('detail_id') + + detail_html = self._download_webpage( + detail_url, movie, + note=u'Downloading detail %s %s' % (detail_id, size_id), + errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) + ) + detail_doc = xml.etree.ElementTree.fromstring(detail_html) + movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') + assert movie_link_el.get('class') == 'movieLink' + movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') + ext = determine_ext(movie_link) + assert ext == 'mov' + + formats.append({ + 'format': format_code, + 'ext': ext, + 'url': movie_link, + }) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'title': title, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'user_agent': 'QuickTime compatible (youtube-dl)', + } + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['ext'] + + playlist.append(info) + + return { + '_type': 'playlist', + 'id': movie, + 'entries': playlist, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 18d591658..69b3b0ad7 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,13 +17,14 @@ class ArteTvIE(InfoExtractor): """ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' + _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) + return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -68,6 +69,12 @@ class ArteTvIE(InfoExtractor): lang = mobj.group('lang') return self._extract_video(url, id, lang) + mobj = re.match(self._LIVEWEB_URL, url) + if mobj is not None: + name = mobj.group('name') + lang = mobj.group('lang') + return self._extract_liveweb(url, name, lang) + if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) @@ -85,7 +92,7 @@ class ArteTvIE(InfoExtractor): info_dict = {'id': player_info['VID'], 'title': player_info['VTI'], - 'description': player_info['VDE'], + 'description': player_info.get('VDE'), 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), 'thumbnail': player_info['programImage'], 'ext': 'flv', @@ -104,6 +111,8 @@ class ArteTvIE(InfoExtractor): formats = filter(_match_lang, formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) # Pick the best quality format_info = formats[-1] if format_info['mediaType'] == u'rtmp': @@ -144,3 +153,22 @@ class ArteTvIE(InfoExtractor): 'url': video_url, 'ext': 'flv', } + + def _extract_liveweb(self, url, name, lang): + """Extract form http://liveweb.arte.tv/""" + webpage = self._download_webpage(url, name) + video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') + config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + video_id, u'Downloading information') + config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + event_doc = config_doc.find('event') + url_node = event_doc.find('video').find('urlHd') + if url_node is None: + url_node = video_doc.find('urlSd') + + return {'id': video_id, + 'title': event_doc.find('name%s' % lang.capitalize()).text, + 'url': url_node.text.replace('MP4', 'mp4'), + 'ext': 'flv', + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 4c8a8af09..dc3a8d47d 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -12,8 +12,8 @@ class C56IE(InfoExtractor): _TEST ={ u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', - u'file': u'93440716.mp4', - u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'file': u'93440716.flv', + u'md5': u'e59995ac63d0457783ea05f93f12a866', u'info_dict': { u'title': u'网事知多少 第32期:车怒', }, diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..50832217a --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import re + +from .common import InfoExtractor + + +class Canalc2IE(InfoExtractor): + _IE_NAME = 'canalc2.tv' + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'060158428b650f896c542dfbb3d6487f', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + title = self._html_search_regex( + r'class="evenement8">(.*?)', webpage, u'title') + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py new file mode 100644 index 000000000..a79f881cd --- /dev/null +++ b/youtube_dl/extractor/cnn.py @@ -0,0 +1,58 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import determine_ext + + +class CNNIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + (?P.+?/(?P[^/]+?)(?:\.cnn|(?=&)))''' + + _TESTS = [{ + u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4', + u'md5': u'3e6121ea48df7e2259fe73a0628605c4', + u'info_dict': { + u'title': u'Nadal wins 8th French Open title', + u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + }, + }, + { + u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", + u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", + u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e", + u"info_dict": { + u"title": "Student's epic speech stuns new freshmen", + u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"" + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + path = mobj.group('path') + page_title = mobj.group('title') + info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path + info_xml = self._download_webpage(info_url, page_title) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + formats = [] + for f in info.findall('files/file'): + mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) + if mf is not None: + formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) + formats = sorted(formats) + (_,_,_, video_path) = formats[-1] + video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path + + thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) + thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + + return {'id': info.attrib['id'], + 'title': info.find('headline').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': thumbnails[-1][1], + 'thumbnails': thumbs_dict, + 'description': info.find('description').text, + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 5badde03a..8d4c93d6d 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -4,15 +4,16 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + determine_ext, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$' - _TEST = { + _TESTS = [{ u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', u'file': u'6902724.mp4', u'md5': u'1264c12ad95dca142a9f0bf7968105a0', @@ -20,7 +21,16 @@ class CollegeHumorIE(InfoExtractor): u'title': u'Comic-Con Cosplay Catastrophe', u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', }, - } + }, + { + u'url': u'http://www.collegehumor.com/video/3505939/font-conference', + u'file': u'3505939.mp4', + u'md5': u'c51ca16b82bb456a4397987791a835f5', + u'info_dict': { + u'title': u'Font Conference', + u'description': u'This video wasn\'t long enough, so we made it double-spaced.', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -49,25 +59,29 @@ class CollegeHumorIE(InfoExtractor): info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text - manifest_url = videoNode.findall('./file')[0].text + next_url = videoNode.findall('./file')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') - manifest_url += '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, - u'Downloading XML manifest', - u'Unable to download video info XML') - - adoc = xml.etree.ElementTree.fromstring(manifestXml) - try: - media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] - node_id = media_node.attrib['url'] - video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError as err: - raise ExtractorError(u'Invalid manifest file') + if next_url.endswith(u'manifest.f4m'): + manifest_url = next_url + '?hdcore=2.10.3' + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') - url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + adoc = xml.etree.ElementTree.fromstring(manifestXml) + try: + media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] + node_id = media_node.attrib['url'] + video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text + except IndexError as err: + raise ExtractorError(u'Invalid manifest file') + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' + else: + # Old-style direct links + info['url'] = next_url + info['ext'] = determine_ext(info['url']) - info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') - info['ext'] = 'mp4' - return [info] + return info diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da50abfc1..77a13aea5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -47,7 +47,8 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. + subtitles: The subtitle file contents as a dictionary in the format + {language: subtitles}. view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -77,7 +78,13 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None @classmethod def working(cls): @@ -107,6 +114,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] @@ -122,7 +134,7 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err) def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index fa8c630d0..1ea449ca8 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,7 +21,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Amphora Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 67a7e5f76..4508f0dfa 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,17 +21,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', webpage, u'video URL', flags=re.DOTALL) - title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", - r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), } return [info] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b633e896c..dc4dea4ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,11 +8,13 @@ from ..utils import ( compat_urllib_error, compat_urllib_parse, compat_urllib_request, + compat_urlparse, ExtractorError, ) from .brightcove import BrightcoveIE + class GenericIE(InfoExtractor): IE_DESC = u'Generic downloader that works on some sites' _VALID_URL = r'.*' @@ -23,7 +25,7 @@ class GenericIE(InfoExtractor): u'file': u'13601338388002.mp4', u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', u'info_dict': { - u"uploader": u"www.hodiho.fr", + u"uploader": u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" } }, @@ -107,8 +109,13 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] + try: + new_url = self._test_redirect(url) + if new_url: + return [self.url_result(new_url)] + except compat_urllib_error.HTTPError: + # This may be a stupid server that doesn't like HEAD, our UA, or so + pass video_id = url.split('/')[-1] try: @@ -119,7 +126,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) - # Look for BrigthCove: + # Look for BrightCove: m_brightcove = re.search(r'', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') @@ -144,6 +151,9 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: mobj = re.search(r'.*?', webpage) + + xml_link = m_download.group(1) + + id = re.search(r'http://www.jeuxvideo.com/config/\w+/0011/(.*?)/\d+_player\.xml', xml_link).group(1) + + xml_config = self._download_webpage(xml_link, title, + 'Downloading XML config') + config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) + info = re.search(r'(.*?)', + xml_config, re.MULTILINE|re.DOTALL).group(1) + info = json.loads(info)['versions'][0] + + video_url = 'http://video720.jeuxvideo.com/' + info['file'] + + return {'id': id, + 'title' : config.find('titre_video').text, + 'ext' : 'mp4', + 'url' : video_url, + 'description': self._og_search_description(webpage), + 'thumbnail': config.find('image').text, + } diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + embed_page = self._download_webpage( + 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, + note=u'Downloading embed page') + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + embed_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', webpage) + description = clean_html(get_element_by_id('edit-description', webpage)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + embed_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py new file mode 100644 index 000000000..03e31ea1c --- /dev/null +++ b/youtube_dl/extractor/muzu.py @@ -0,0 +1,64 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + determine_ext, +) + + +class MuzuTVIE(InfoExtractor): + _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)' + IE_NAME = u'muzu.tv' + + _TEST = { + u'url': u'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', + u'file': u'1981454.mp4', + u'md5': u'98f8b2c7bc50578d6a0364fff2bfb000', + u'info_dict': { + u'title': u'Cat Walk (Original Mix)', + u'description': u'md5:90e868994de201b2570e4e5854e19420', + u'uploader': u'MarcAshken featuring SOS', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + info_data = compat_urllib_parse.urlencode({'format': 'json', + 'url': url, + }) + video_info_page = self._download_webpage('http://www.muzu.tv/api/oembed/?%s' % info_data, + video_id, u'Downloading video info') + info = json.loads(video_info_page) + + player_info_page = self._download_webpage('http://player.muzu.tv/player/playerInit?ai=%s' % video_id, + video_id, u'Downloading player info') + video_info = json.loads(player_info_page)['videos'][0] + for quality in ['1080' , '720', '480', '360']: + if video_info.get('v%s' % quality): + break + + data = compat_urllib_parse.urlencode({'ai': video_id, + # Even if each time you watch a video the hash changes, + # it seems to work for different videos, and it will work + # even if you use any non empty string as a hash + 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', + 'device': 'web', + 'qv': quality, + }) + video_url_page = self._download_webpage('http://player.muzu.tv/player/requestVideo?%s' % data, + video_id, u'Downloading video url') + video_url_info = json.loads(video_url_page) + video_url = video_url_info['url'] + + return {'id': video_id, + 'title': info['title'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info['thumbnail_url'], + 'description': info['description'], + 'uploader': info['author_name'], + } diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index b2a7b1df0..0404e6e43 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -2,11 +2,13 @@ import binascii import base64 import hashlib import re +import json from .common import InfoExtractor from ..utils import ( compat_ord, compat_urllib_parse, + compat_urllib_request, ExtractorError, ) @@ -16,7 +18,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): """Information Extractor for myvideo.de.""" - _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' + _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*' IE_NAME = u'myvideo' _TEST = { u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -85,6 +87,20 @@ class MyVideoIE(InfoExtractor): 'ext': video_ext, }] + mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) + if mobj is not None: + request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') + response = self._download_webpage(request, video_id, + u'Downloading video info') + info = json.loads(base64.b64decode(response).decode('utf-8')) + return {'id': video_id, + 'title': info['title'], + 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), + 'play_path': info['filename'], + 'ext': 'flv', + 'thumbnail': info['thumbnail'][0]['url'], + } + # try encxml mobj = re.search('var flashvars={(.+?)}', webpage) if mobj is None: diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py new file mode 100644 index 000000000..3bc9dae6d --- /dev/null +++ b/youtube_dl/extractor/nbc.py @@ -0,0 +1,33 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import find_xpath_attr, compat_str + + +class NBCNewsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.nbcnews.com/video/nbc-news/52753292', + u'file': u'52753292.flv', + u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179', + u'info_dict': { + u'title': u'Crew emerges after four-month Mars food study', + u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + + return {'id': video_id, + 'title': info.find('headline').text, + 'ext': 'flv', + 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, + 'description': compat_str(info.find('caption').text), + 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, + } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py new file mode 100644 index 000000000..b734722d0 --- /dev/null +++ b/youtube_dl/extractor/ooyala.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unescapeHTML + +class OoyalaIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P<id>.+?)(&|$)' + + _TEST = { + # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video + u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4', + u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c', + u'info_dict': { + u'title': u'Explaining Data Recovery from Hard Drives and SSDs', + u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + }, + } + + def _extract_result(self, info, more_info): + return {'id': info['embedCode'], + 'ext': 'mp4', + 'title': unescapeHTML(info['title']), + 'url': info['url'], + 'description': unescapeHTML(more_info['description']), + 'thumbnail': more_info['promo'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + embedCode = mobj.group('id') + player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode + player = self._download_webpage(player_url, embedCode) + mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', + player, u'mobile player url') + mobile_player = self._download_webpage(mobile_url, embedCode) + videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') + videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') + videos_info = json.loads(videos_info) + videos_more_info =json.loads(videos_more_info) + + if videos_more_info.get('lineup'): + videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] + return {'_type': 'playlist', + 'id': embedCode, + 'title': unescapeHTML(videos_more_info['title']), + 'entries': videos, + } + else: + return self._extract_result(videos_info[0], videos_more_info) + diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py new file mode 100644 index 000000000..65462d867 --- /dev/null +++ b/youtube_dl/extractor/pbs.py @@ -0,0 +1,34 @@ +import re +import json + +from .common import InfoExtractor + + +class PBSIE(InfoExtractor): + _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?' + + _TEST = { + u'url': u'http://video.pbs.org/video/2365006249/', + u'file': u'2365006249.mp4', + u'md5': 'ce1888486f0908d555a8093cac9a7362', + u'info_dict': { + u'title': u'A More Perfect Union', + u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a', + u'duration': 3190, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id + info_page = self._download_webpage(info_url, video_id) + info =json.loads(info_page) + return {'id': video_id, + 'title': info['title'], + 'url': info['alternate_encoding']['url'], + 'ext': 'mp4', + 'description': info['program'].get('description'), + 'thumbnail': info.get('image_url'), + 'duration': info.get('duration'), + } diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py new file mode 100644 index 000000000..c32f64d99 --- /dev/null +++ b/youtube_dl/extractor/ro220.py @@ -0,0 +1,42 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + compat_parse_qs, +) + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)' + _TEST = { + u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/", + u'file': u'LYV6doKo7f.mp4', + u'md5': u'03af18b73a07b4088753930db7a34add', + u'info_dict': { + u"title": u"Luati-le Banii sez 4 ep 1", + u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + flashVars_str = self._search_regex( + r'<param name="flashVars" value="([^"]+)"', + webpage, u'flashVars') + flashVars = compat_parse_qs(flashVars_str) + + info = { + '_type': 'video', + 'id': video_id, + 'ext': 'mp4', + 'url': flashVars['videoURL'][0], + 'title': flashVars['title'][0], + 'description': clean_html(flashVars['desc'][0]), + 'thumbnail': flashVars['preview'][0], + } + return info diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py new file mode 100644 index 000000000..7bb236c2b --- /dev/null +++ b/youtube_dl/extractor/rtlnow.py @@ -0,0 +1,126 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, +) + +class RTLnowIE(InfoExtractor): + """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" + _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _TESTS = [{ + u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', + u'file': u'90419.flv', + u'info_dict': { + u'upload_date': u'20070416', + u'title': u'Ahornallee - Folge 1 - Der Einzug', + u'description': u'Folge 1 - Der Einzug', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', + u'file': u'69756.flv', + u'info_dict': { + u'upload_date': u'20120519', + u'title': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...', + u'description': u'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', + u'thumbnail': u'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', + }, + { + u'url': u'www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', + u'file': u'13883.flv', + u'info_dict': { + u'upload_date': u'20090627', + u'title': u'Voxtours - Südafrika-Reporter II', + u'description': u'Südafrika-Reporter II', + }, + u'params': { + u'skip_download': True, + }, + }, + { + u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', + u'file': u'99205.flv', + u'info_dict': { + u'upload_date': u'20080928', + u'title': u'Medicopter 117 - Angst!', + u'description': u'Angst!', + u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg' + }, + u'params': { + u'skip_download': True, + }, + }] + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + video_page_url = u'http://' + mobj.group('base_url') + video_id = mobj.group(u'video_id') + + webpage = self._download_webpage(webpage_url, video_id) + + note_m = re.search(r'''(?sx) + <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?) + <div[ ]id="playerteaser">''', webpage) + if note_m: + msg = clean_html(note_m.group(1)) + raise ExtractorError(msg) + + video_title = self._html_search_regex(r'<title>(?P<title>[^<]+)', + webpage, u'title') + playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', + webpage, u'playerdata_url') + + playerdata = self._download_webpage(playerdata_url, video_id) + mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]>', playerdata) + if mobj: + video_description = mobj.group(u'description') + if mobj.group('upload_date_Y'): + video_upload_date = mobj.group('upload_date_Y') + else: + video_upload_date = u'20' + mobj.group('upload_date_y') + video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_description = None + video_upload_date = None + self._downloader.report_warning(u'Unable to extract description and upload date') + + # Thumbnail: not every video has an thumbnail + mobj = re.search(r'', webpage) + if mobj: + video_thumbnail = mobj.group(u'thumbnail') + else: + video_thumbnail = None + + mobj = re.search(r']+>rtmpe://(?:[^/]+/){2})(?P[^\]]+)\]\]>', playerdata) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + video_url = mobj.group(u'url') + video_play_path = u'mp4:' + mobj.group(u'play_path') + video_player_url = video_page_url + u'includes/vodplayer.swf' + + return [{ + 'id': video_id, + 'url': video_url, + 'play_path': video_play_path, + 'page_url': video_page_url, + 'player_url': video_player_url, + 'ext': 'flv', + 'title': video_title, + 'description': video_description, + 'upload_date': video_upload_date, + 'thumbnail': video_thumbnail, + }] diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py new file mode 100644 index 000000000..2cba53076 --- /dev/null +++ b/youtube_dl/extractor/slashdot.py @@ -0,0 +1,23 @@ +import re + +from .common import InfoExtractor + + +class SlashdotIE(InfoExtractor): + _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P.*?)(&|$)' + + _TEST = { + u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz', + u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4', + u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', + u'info_dict': { + u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + ooyala_url = self._search_regex(r'