X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube-dl;h=042b852671b01e5f12092d5a980337360129199d;hb=8abf76ddb90c83a5afd7517d4797f3c07bd1cf0f;hp=251254765b4c831ca55581c4b8d8b535e88b4523;hpb=6a4f0a114d88965c171d0117db68be64b4db9acd;p=youtube-dl diff --git a/youtube-dl b/youtube-dl index 251254765..042b85267 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2,24 +2,31 @@ # -*- coding: utf-8 -*- __author__ = ( - "Ricardo Garcia Gonzalez", - "Danny Colligan", - "Benjamin Johnson", - "Vasyl' Vavrychuk", - "Witold Baryluk", - "PaweŠPaprota", - "Gergely Imreh", + 'Ricardo Garcia Gonzalez', + 'Danny Colligan', + 'Benjamin Johnson', + 'Vasyl\' Vavrychuk', + 'Witold Baryluk', + 'PaweŠPaprota', + 'Gergely Imreh', + 'Rogério Brito', + 'Philipp Hagemeister', + 'Sören Schulze', + 'Kevin Ngo', + 'Ori Avtalion', + 'shizeeg', ) -__license__ = "Public Domain" -__version__ = '2011.08.04' +__license__ = 'Public Domain' +__version__ = '2011.11.23' + +UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' import cookielib -import ctypes import datetime -import email.utils import gzip import htmlentitydefs +import HTMLParser import httplib import locale import math @@ -29,20 +36,42 @@ import os.path import re import socket import string -import StringIO import subprocess import sys import time import urllib import urllib2 +import warnings import zlib +if os.name == 'nt': + import ctypes + +try: + import email.utils +except ImportError: # Python 2.4 + import email.Utils +try: + import cStringIO as StringIO +except ImportError: + import StringIO + # parse_qs was moved from the cgi module to the urlparse module recently. try: from urlparse import parse_qs except ImportError: from cgi import parse_qs +try: + import lxml.etree +except ImportError: + pass # Handled below + +try: + import xml.etree.ElementTree +except ImportError: # Python<2.5: Not officially supported, but let it slip + warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') + std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -51,7 +80,118 @@ std_headers = { 'Accept-Language': 'en-us,en;q=0.5', } -simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii') +try: + import json +except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson): + import re + class json(object): + @staticmethod + def loads(s): + s = s.decode('UTF-8') + def raiseError(msg, i): + raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:])) + def skipSpace(i, expectMore=True): + while i < len(s) and s[i] in ' \t\r\n': + i += 1 + if expectMore: + if i >= len(s): + raiseError('Premature end', i) + return i + def decodeEscape(match): + esc = match.group(1) + _STATIC = { + '"': '"', + '\\': '\\', + '/': '/', + 'b': unichr(0x8), + 'f': unichr(0xc), + 'n': '\n', + 'r': '\r', + 't': '\t', + } + if esc in _STATIC: + return _STATIC[esc] + if esc[0] == 'u': + if len(esc) == 1+4: + return unichr(int(esc[1:5], 16)) + if len(esc) == 5+6 and esc[5:7] == '\\u': + hi = int(esc[1:5], 16) + low = int(esc[7:11], 16) + return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000) + raise ValueError('Unknown escape ' + str(esc)) + def parseString(i): + i += 1 + e = i + while True: + e = s.index('"', e) + bslashes = 0 + while s[e-bslashes-1] == '\\': + bslashes += 1 + if bslashes % 2 == 1: + e += 1 + continue + break + rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)') + stri = rexp.sub(decodeEscape, s[i:e]) + return (e+1,stri) + def parseObj(i): + i += 1 + res = {} + i = skipSpace(i) + if s[i] == '}': # Empty dictionary + return (i+1,res) + while True: + if s[i] != '"': + raiseError('Expected a string object key', i) + i,key = parseString(i) + i = skipSpace(i) + if i >= len(s) or s[i] != ':': + raiseError('Expected a colon', i) + i,val = parse(i+1) + res[key] = val + i = skipSpace(i) + if s[i] == '}': + return (i+1, res) + if s[i] != ',': + raiseError('Expected comma or closing curly brace', i) + i = skipSpace(i+1) + def parseArray(i): + res = [] + i = skipSpace(i+1) + if s[i] == ']': # Empty array + return (i+1,res) + while True: + i,val = parse(i) + res.append(val) + i = skipSpace(i) # Raise exception if premature end + if s[i] == ']': + return (i+1, res) + if s[i] != ',': + raiseError('Expected a comma or closing bracket', i) + i = skipSpace(i+1) + def parseDiscrete(i): + for k,v in {'true': True, 'false': False, 'null': None}.items(): + if s.startswith(k, i): + return (i+len(k), v) + raiseError('Not a boolean (or null)', i) + def parseNumber(i): + mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:]) + if mobj is None: + raiseError('Not a number', i) + nums = mobj.group(1) + if '.' in nums or 'e' in nums or 'E' in nums: + return (i+len(nums), float(nums)) + return (i+len(nums), int(nums)) + CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete} + def parse(i): + i = skipSpace(i) + i,res = CHARMAP.get(s[i], parseNumber)(i) + i = skipSpace(i, False) + return (i,res) + i,res = parse(0) + if i < len(s): + raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')') + return res def preferredencoding(): """Get preferred encoding. @@ -69,6 +209,7 @@ def preferredencoding(): yield pref return yield_preferredencoding().next() + def htmlentity_transform(matchobj): """Transforms an HTML entity to a Unicode character. @@ -95,11 +236,13 @@ def htmlentity_transform(matchobj): # Unknown entity in name, return its literal representation return (u'&%s;' % entity) + def sanitize_title(utitle): """Sanitizes a video title so it could be used as part of a filename.""" utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) return utitle.replace(unicode(os.sep), u'%') + def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -126,13 +269,18 @@ def sanitize_open(filename, open_mode): stream = open(filename, open_mode) return (stream, filename) + def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + +def _simplify_title(title): + expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) + return expr.sub(u'_', title).strip(u'_') class DownloadError(Exception): """Download Error exception. @@ -143,6 +291,7 @@ class DownloadError(Exception): """ pass + class SameFileError(Exception): """Same File exception. @@ -151,6 +300,7 @@ class SameFileError(Exception): """ pass + class PostProcessingError(Exception): """Post Processing exception. @@ -159,6 +309,7 @@ class PostProcessingError(Exception): """ pass + class UnavailableVideoError(Exception): """Unavailable Format exception. @@ -167,6 +318,7 @@ class UnavailableVideoError(Exception): """ pass + class ContentTooShortError(Exception): """Content Too Short exception. @@ -182,6 +334,7 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected + class YoutubeDLHandler(urllib2.HTTPHandler): """Handler for HTTP requests and responses. @@ -191,11 +344,11 @@ class YoutubeDLHandler(urllib2.HTTPHandler): a particular request, the original request in the program code only has to include the HTTP header "Youtubedl-No-Compression", which will be removed before making the real request. - + Part of this code was copied from: - http://techknack.net/python-urllib2-handlers/ - + http://techknack.net/python-urllib2-handlers/ + Andrew Rowls, the author of that code, agreed to release it to the public domain. """ @@ -206,7 +359,7 @@ class YoutubeDLHandler(urllib2.HTTPHandler): return zlib.decompress(data, -zlib.MAX_WBITS) except zlib.error: return zlib.decompress(data) - + @staticmethod def addinfourl_wrapper(stream, headers, url, code): if hasattr(urllib2.addinfourl, 'getcode'): @@ -214,7 +367,7 @@ class YoutubeDLHandler(urllib2.HTTPHandler): ret = urllib2.addinfourl(stream, headers, url) ret.code = code return ret - + def http_request(self, req): for h in std_headers: if h in req.headers: @@ -240,6 +393,7 @@ class YoutubeDLHandler(urllib2.HTTPHandler): resp.msg = old_resp.msg return resp + class FileDownloader(object): """File Downloader class. @@ -288,10 +442,14 @@ class FileDownloader(object): noprogress: Do not print the progress bar. playliststart: Playlist item to start at. playlistend: Playlist item to end at. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. logtostderr: Log messages to stderr instead of stdout. consoletitle: Display progress in console window's titlebar. nopart: Do not use temporary .part files. updatetime: Use the Last-modified header to set output file timestamps. + writedescription: Write the video description to a .description file + writeinfojson: Write the video description to a .info.json file """ params = None @@ -310,16 +468,6 @@ class FileDownloader(object): self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self.params = params - @staticmethod - def pmkdir(filename): - """Create directory components in filename. Similar to Unix "mkdir -p".""" - components = filename.split(os.sep) - aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))] - aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator - for dir in aggregate: - if not os.path.exists(dir): - os.mkdir(dir) - @staticmethod def format_bytes(bytes): if bytes is None: @@ -331,7 +479,7 @@ class FileDownloader(object): else: exponent = long(math.log(bytes, 1024.0)) suffix = 'bkMGTPEZY'[exponent] - converted = float(bytes) / float(1024**exponent) + converted = float(bytes) / float(1024 ** exponent) return '%.2f%s' % (converted, suffix) @staticmethod @@ -469,7 +617,7 @@ class FileDownloader(object): os.rename(old_filename, new_filename) except (IOError, OSError), err: self.trouble(u'ERROR: unable to rename file') - + def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" if last_modified_hdr is None: @@ -481,11 +629,20 @@ class FileDownloader(object): return filetime = timeconvert(timestr) if filetime is None: - return + return filetime try: - os.utime(filename,(time.time(), filetime)) + os.utime(filename, (time.time(), filetime)) except: pass + return filetime + + def report_writedescription(self, descfn): + """ Report that the description file is being written """ + self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) + + def report_writeinfojson(self, infofn): + """ Report that the metadata file has been written """ + self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) def report_destination(self, filename): """Report destination filename.""" @@ -542,54 +699,117 @@ class FileDownloader(object): self.trouble(u'ERROR: invalid system charset or erroneous output template') return None + def _match_entry(self, info_dict): + """ Returns None iff the file should be downloaded """ + + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + return None + def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" + + reason = self._match_entry(info_dict) + if reason is not None: + self.to_screen(u'[download] ' + reason) + return + + max_downloads = self.params.get('max_downloads') + if max_downloads is not None: + if self._num_downloads > int(max_downloads): + self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title']) + return + filename = self.prepare_filename(info_dict) + + # Forced printings + if self.params.get('forcetitle', False): + print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceurl', False): + print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcedescription', False) and 'description' in info_dict: + print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcefilename', False) and filename is not None: + print filename.encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceformat', False): + print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') + # Do nothing else if in simulate mode if self.params.get('simulate', False): - # Forced printings - if self.params.get('forcetitle', False): - print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forceurl', False): - print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: - print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcedescription', False) and 'description' in info_dict: - print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcefilename', False) and filename is not None: - print filename.encode(preferredencoding(), 'xmlcharrefreplace') - return if filename is None: return + if self.params.get('nooverwrites', False) and os.path.exists(filename): self.to_stderr(u'WARNING: file exists and will be skipped') return try: - self.pmkdir(filename) + dn = os.path.dirname(filename) + if dn != '' and not os.path.exists(dn): + os.makedirs(dn) except (OSError, IOError), err: - self.trouble(u'ERROR: unable to create directories: %s' % str(err)) + self.trouble(u'ERROR: unable to create directory ' + unicode(err)) return - try: - success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None)) - except (OSError, IOError), err: - raise UnavailableVideoError - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.trouble(u'ERROR: unable to download video data: %s' % str(err)) - return - except (ContentTooShortError, ), err: - self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return + if self.params.get('writedescription', False): + try: + descfn = filename + '.description' + self.report_writedescription(descfn) + descfile = open(descfn, 'wb') + try: + descfile.write(info_dict['description'].encode('utf-8')) + finally: + descfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write description file ' + descfn) + return + + if self.params.get('writeinfojson', False): + infofn = filename + '.info.json' + self.report_writeinfojson(infofn) + try: + json.dump + except (NameError,AttributeError): + self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') + return + try: + infof = open(infofn, 'wb') + try: + json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) + json.dump(json_info_dict, infof) + finally: + infof.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) + return - if success: + if not self.params.get('skip_download', False): try: - self.post_process(filename, info_dict) - except (PostProcessingError), err: - self.trouble(u'ERROR: postprocessing: %s' % str(err)) + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return + + if success: + try: + self.post_process(filename, info_dict) + except (PostProcessingError), err: + self.trouble(u'ERROR: postprocessing: %s' % str(err)) + return def download(self, url_list): """Download a given list of URLs.""" @@ -650,6 +870,11 @@ class FileDownloader(object): cursize = os.path.getsize(tmpfilename) if prevsize == cursize and retval == 1: break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == 2 and cursize > 1024: + self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + retval = 0 + break if retval == 0: self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) self.try_rename(tmpfilename, filename) @@ -658,7 +883,10 @@ class FileDownloader(object): self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) return False - def _do_download(self, filename, url, player_url): + def _do_download(self, filename, info_dict): + url = info_dict['url'] + player_url = info_dict.get('player_url', None) + # Check file already present if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): self.report_file_already_downloaded(filename) @@ -670,7 +898,6 @@ class FileDownloader(object): tmpfilename = self.temp_name(filename) stream = None - open_mode = 'wb' # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -683,17 +910,22 @@ class FileDownloader(object): else: resume_len = 0 - # Request parameters in case of being able to resume - if self.params.get('continuedl', False) and resume_len != 0: - self.report_resuming_byte(resume_len) - request.add_header('Range','bytes=%d-' % resume_len) - open_mode = 'ab' + open_mode = 'wb' + if resume_len != 0: + if self.params.get('continuedl', False): + self.report_resuming_byte(resume_len) + request.add_header('Range','bytes=%d-' % resume_len) + open_mode = 'ab' + else: + resume_len = 0 count = 0 retries = self.params.get('retries', 0) while count <= retries: # Establish connection try: + if count == 0 and 'urlhandle' in info_dict: + data = info_dict['urlhandle'] data = urllib2.urlopen(request) break except (urllib2.HTTPError, ), err: @@ -712,7 +944,7 @@ class FileDownloader(object): else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < long(content_length) < resume_len + 100)): + (resume_len - 100 < long(content_length) < resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -757,6 +989,7 @@ class FileDownloader(object): if stream is None: try: (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) + assert stream is not None filename = self.undo_temp_name(tmpfilename) self.report_destination(filename) except (OSError, IOError), err: @@ -770,14 +1003,20 @@ class FileDownloader(object): block_size = self.best_block_size(after - before, len(data_block)) # Progress message - percent_str = self.calc_percent(byte_counter, data_len) - eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) - self.report_progress(percent_str, data_len_str, speed_str, eta_str) + if data_len is None: + self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + else: + percent_str = self.calc_percent(byte_counter, data_len) + eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent_str, data_len_str, speed_str, eta_str) # Apply rate limit self.slow_down(start, byte_counter - resume_len) + if stream is None: + self.trouble(u'\nERROR: Did not get any data blocks') + return False stream.close() self.report_finish() if data_len is not None and byte_counter != data_len: @@ -786,10 +1025,11 @@ class FileDownloader(object): # Update file modification time if self.params.get('updatetime', True): - self.try_utime(filename, data.info().get('last-modified', None)) + info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) return True + class InfoExtractor(object): """Information Extractor class. @@ -820,9 +1060,8 @@ class InfoExtractor(object): description: One-line video description. Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods, as well as the suitable() static method. - Probably, they should also be instantiated and added to the main - downloader. + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. """ _ready = False @@ -833,10 +1072,9 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - @staticmethod - def suitable(url): + def suitable(self, url): """Receives a URL and returns True if suitable for this IE.""" - return False + return re.match(self._VALID_URL, url) is not None def initialize(self): """Initializes an instance (authentication, etc).""" @@ -861,16 +1099,17 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13'] + _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -879,12 +1118,25 @@ class YoutubeIE(InfoExtractor): '37': 'mp4', '38': 'video', # You actually don't know if this will be MOV, AVI or whatever '43': 'webm', + '44': 'webm', '45': 'webm', } - - @staticmethod - def suitable(url): - return (re.match(YoutubeIE._VALID_URL, url) is not None) + _video_dimensions = { + '5': '240x400', + '6': '???', + '13': '???', + '17': '144x176', + '18': '360x640', + '22': '720x1280', + '34': '360x640', + '35': '480x854', + '37': '1080x1920', + '38': '3072x4096', + '43': '360x640', + '44': '480x854', + '45': '720x1280', + } + IE_NAME = u'youtube' def report_lang(self): """Report attempt to set language.""" @@ -918,6 +1170,11 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _print_formats(self, formats): + print 'Available formats:' + for x in formats: + print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + def _real_initialize(self): if self._downloader is None: return @@ -997,7 +1254,7 @@ class YoutubeIE(InfoExtractor): # Get video webpage self.report_video_webpage_download(video_id) - request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) + request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: @@ -1015,7 +1272,7 @@ class YoutubeIE(InfoExtractor): self.report_video_info_webpage_download(video_id) for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) + % (video_id, el_type)) request = urllib2.Request(video_info_url) try: video_info_webpage = urllib2.urlopen(request).read() @@ -1050,8 +1307,7 @@ class YoutubeIE(InfoExtractor): video_title = sanitize_title(video_title) # simplified title - simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - simple_title = simple_title.strip(ur'_') + simple_title = _simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -1073,11 +1329,19 @@ class YoutubeIE(InfoExtractor): pass # description - video_description = 'No description available.' - if self._downloader.params.get('forcedescription', False): - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1) + try: + lxml.etree + except NameError: + video_description = u'No description available.' + if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False): + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') + else: + html_parser = lxml.etree.HTMLParser(encoding='utf-8') + vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) + video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) + # TODO use another parser # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1085,10 +1349,15 @@ class YoutubeIE(InfoExtractor): # Decide which formats to download req_format = self._downloader.params.get('format', None) - if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): + self.report_rtmp_download() + video_url_list = [(None, video_info['conn'][0])] + elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') - url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs] - url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data) + url_data = [parse_qs(uds) for uds in url_data_strs] + url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) + url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) + format_limit = self._downloader.params.get('format_limit', None) if format_limit is not None and format_limit in self._available_formats: format_list = self._available_formats[self._available_formats.index(format_limit):] @@ -1098,23 +1367,29 @@ class YoutubeIE(InfoExtractor): if len(existing_formats) == 0: self._downloader.trouble(u'ERROR: no known formats available for video') return - if req_format is None: + if self._downloader.params.get('listformats', None): + self._print_formats(existing_formats) + return + if req_format is None or req_format == 'best': video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == '-1': + elif req_format == 'worst': + video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality + elif req_format in ('-1', 'all'): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: - # Specific format - if req_format not in url_map: + # Specific formats. We pick the first in a slash-delimeted sequence. + # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + req_formats = req_format.split('/') + video_url_list = None + for rf in req_formats: + if rf in url_map: + video_url_list = [(rf, url_map[rf])] + break + if video_url_list is None: self._downloader.trouble(u'ERROR: requested format not available') return - video_url_list = [(req_format, url_map[req_format])] # Specific format - - elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - video_url_list = [(None, video_info['conn'][0])] - else: - self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info') + self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') return for format_param, video_real_url in video_url_list: @@ -1124,7 +1399,6 @@ class YoutubeIE(InfoExtractor): # Extension video_extension = self._video_extensions.get(format_param, 'flv') - # Find the video URL in fmt_url_map or conn paramters try: # Process video information self._downloader.process_info({ @@ -1137,7 +1411,7 @@ class YoutubeIE(InfoExtractor): 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description.decode('utf-8'), + 'description': video_description, 'player_url': player_url, }) except UnavailableVideoError, err: @@ -1151,15 +1425,12 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' _youtube_ie = None + IE_NAME = u'metacafe' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(MetacafeIE._VALID_URL, url) is not None) - def report_disclaimer(self): """Report disclaimer retrieval.""" self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') @@ -1293,14 +1564,11 @@ class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + IE_NAME = u'dailymotion' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(DailymotionIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) @@ -1309,9 +1577,6 @@ class DailymotionIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) - def _real_initialize(self): - return - def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) @@ -1328,6 +1593,7 @@ class DailymotionIE(InfoExtractor): # Retrieve video webpage to extract further information request = urllib2.Request(url) + request.add_header('Cookie', 'family_filter=off') try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -1337,25 +1603,29 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage) + mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - mediaURL = urllib.unquote(mobj.group(1)) + sequence = urllib.unquote(mobj.group(1)) + mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract media URL') + return + mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') # if needed add http://www.dailymotion.com/ if relative URL video_url = mediaURL - # '' - mobj = re.search(r'(?im)
(.*?)
', webpage) + if mobj: + description = mobj.group(1) + + # upload date + upload_date = None + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", webpage) + if mobj: + try: + upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except Exception, e: + print str(e) + + # for soundcloud, a request to a cross domain is required for cookies + request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) + + try: + self._downloader.process_info({ + 'id': video_id.decode('utf-8'), + 'url': mediaURL, + 'uploader': uploader.decode('utf-8'), + 'upload_date': upload_date, + 'title': simple_title.decode('utf-8'), + 'stitle': simple_title.decode('utf-8'), + 'ext': u'mp3', + 'format': u'NA', + 'player_url': None, + 'description': description.decode('utf-8') + }) + except UnavailableVideoError: + self._downloader.trouble(u'\nERROR: unable to download video') + + +class InfoQIE(InfoExtractor): + """Information extractor for infoq.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' + IE_NAME = u'infoq' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + self.report_webpage(url) + + request = urllib2.Request(url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction(url) + + + # Extract video URL + mobj = re.search(r"jsclassref='([^']*)'", webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video url') + return + video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64')) + + + # Extract title + mobj = re.search(r'contentTitle = "(.*?)";', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + video_title = mobj.group(1).decode('utf-8') + + # Extract description + video_description = u'No description available.' + mobj = re.search(r'', webpage) + if mobj is not None: + video_description = mobj.group(1).decode('utf-8') + + video_filename = video_url.split('/')[-1] + video_id, extension = video_filename.split('.') + + self._downloader.increment_downloads() + info = { + 'id': video_id, + 'url': video_url, + 'uploader': None, + 'upload_date': None, + 'title': video_title, + 'stitle': _simplify_title(video_title), + 'ext': extension, + 'format': extension, # Extension is always(?) mp4, but seems to be flv + 'thumbnail': None, + 'description': video_description, + 'player_url': None, + } + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download ' + video_url) + +class MixcloudIE(InfoExtractor): + """Information extractor for www.mixcloud.com""" + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' + IE_NAME = u'mixcloud' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_json(self, file_id): + """Report JSON download.""" + self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME) + + def report_extraction(self, file_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) + + def get_urls(self, jsonData, fmt, bitrate='best'): + """Get urls from 'audio_formats' section in json""" + file_url = None + try: + bitrate_list = jsonData[fmt] + if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: + bitrate = max(bitrate_list) # select highest + + url_list = jsonData[fmt][bitrate] + except TypeError: # we have no bitrate info. + url_list = jsonData[fmt] + + return url_list + + def check_urls(self, url_list): + """Returns 1st active url from list""" + for url in url_list: + try: + urllib2.urlopen(url) + return url + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + url = None + + return None + + def _print_formats(self, formats): + print 'Available formats:' + for fmt in formats.keys(): + for b in formats[fmt]: + try: + ext = formats[fmt][b][0] + print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]) + except TypeError: # we have no bitrate info + ext = formats[fmt][0] + print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]) + break + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + # extract uploader & filename from url + uploader = mobj.group(1).decode('utf-8') + file_id = uploader + "-" + mobj.group(2).decode('utf-8') + + # construct API request + file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' + # retrieve .json file with links to files + request = urllib2.Request(file_url) + try: + self.report_download_json(file_url) + jsonData = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err)) + return + + # parse JSON + json_data = json.loads(jsonData) + player_url = json_data['player_swf_url'] + formats = dict(json_data['audio_formats']) + + req_format = self._downloader.params.get('format', None) + bitrate = None + + if self._downloader.params.get('listformats', None): + self._print_formats(formats) + return + + if req_format is None or req_format == 'best': + for format_param in formats.keys(): + url_list = self.get_urls(formats, format_param) + # check urls + file_url = self.check_urls(url_list) + if file_url is not None: + break # got it! + else: + if req_format not in formats.keys(): + self._downloader.trouble(u'ERROR: format is not available') + return + + url_list = self.get_urls(formats, req_format) + file_url = self.check_urls(url_list) + format_param = req_format + + # We have audio + self._downloader.increment_downloads() + try: + # Process file information + self._downloader.process_info({ + 'id': file_id.decode('utf-8'), + 'url': file_url.decode('utf-8'), + 'uploader': uploader.decode('utf-8'), + 'upload_date': u'NA', + 'title': json_data['name'], + 'stitle': _simplify_title(json_data['name']), + 'ext': file_url.split('.')[-1].decode('utf-8'), + 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'thumbnail': json_data['thumbnail_url'], + 'description': json_data['description'], + 'player_url': player_url.decode('utf-8'), + }) + except UnavailableVideoError, err: + self._downloader.trouble(u'ERROR: unable to download file') + + + +class PostProcessor(object): + """Post Processor class. + + PostProcessor objects can be added to downloaders with their + add_post_processor() method. When the downloader has finished a + successful download, it will take its internal chain of PostProcessors + and start calling the run() method on each one of them, first with + an initial argument and then with the returned value of the previous + PostProcessor. + + The chain will be stopped if one of them ever returns None or the end + of the chain is reached. + + PostProcessor objects follow a "mutual registration" process similar + to InfoExtractor objects. + """ _downloader = None @@ -2617,13 +3792,16 @@ class PostProcessor(object): """ return information # by default, do nothing + class FFmpegExtractAudioPP(PostProcessor): - def __init__(self, downloader=None, preferredcodec=None): + def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False): PostProcessor.__init__(self, downloader) if preferredcodec is None: preferredcodec = 'best' self._preferredcodec = preferredcodec + self._preferredquality = preferredquality + self._keepvideo = keepvideo @staticmethod def get_audio_codec(path): @@ -2662,24 +3840,32 @@ class FFmpegExtractAudioPP(PostProcessor): more_opts = [] if self._preferredcodec == 'best' or self._preferredcodec == filecodec: - if filecodec == 'aac' or filecodec == 'mp3': + if filecodec in ['aac', 'mp3', 'vorbis']: # Lossless if possible acodec = 'copy' extension = filecodec if filecodec == 'aac': more_opts = ['-f', 'adts'] + if filecodec == 'vorbis': + extension = 'ogg' else: # MP3 otherwise. acodec = 'libmp3lame' extension = 'mp3' - more_opts = ['-ab', '128k'] + more_opts = [] + if self._preferredquality is not None: + more_opts += ['-ab', self._preferredquality] else: # We convert the audio (lossy) - acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec] + acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec] extension = self._preferredcodec - more_opts = ['-ab', '128k'] + more_opts = [] + if self._preferredquality is not None: + more_opts += ['-ab', self._preferredquality] if self._preferredcodec == 'aac': more_opts += ['-f', 'adts'] + if self._preferredcodec == 'vorbis': + extension = 'ogg' (prefix, ext) = os.path.splitext(path) new_path = prefix + '.' + extension @@ -2690,11 +3876,19 @@ class FFmpegExtractAudioPP(PostProcessor): self._downloader.to_stderr(u'WARNING: error running ffmpeg') return None - try: - os.remove(path) - except (IOError, OSError): - self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file') - return None + # Try to update the date time for extracted audio file. + if information.get('filetime') is not None: + try: + os.utime(new_path, (time.time(), information['filetime'])) + except: + self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') + + if not self._keepvideo: + try: + os.remove(path) + except (IOError, OSError): + self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file') + return None information['filepath'] = new_path return information @@ -2706,30 +3900,51 @@ def updateSelf(downloader, filename): if not os.access(filename, os.W_OK): sys.exit('ERROR: no write permissions on %s' % filename) - downloader.to_screen('Updating to latest stable version...') + downloader.to_screen('Updating to latest version...') try: - latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION' - latest_version = urllib.urlopen(latest_url).read().strip() - prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version - newcontent = urllib.urlopen(prog_url).read() + try: + urlh = urllib.urlopen(UPDATE_URL) + newcontent = urlh.read() + + vmatch = re.search("__version__ = '([^']+)'", newcontent) + if vmatch is not None and vmatch.group(1) == __version__: + downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')') + return + finally: + urlh.close() except (IOError, OSError), err: sys.exit('ERROR: unable to download latest version') try: - stream = open(filename, 'w') - stream.write(newcontent) - stream.close() + outf = open(filename, 'wb') + try: + outf.write(newcontent) + finally: + outf.close() except (IOError, OSError), err: sys.exit('ERROR: unable to overwrite current version') - downloader.to_screen('Updated to version %s' % latest_version) - + downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.') def parseOpts(): # Deferred imports import getpass import optparse + import shlex + + def _readOptions(filename): + try: + optionf = open(filename) + except IOError: + return [] # silently skip if file is not present + try: + res = [] + for l in optionf: + res += shlex.split(l, comments=True) + finally: + optionf.close() + return res def _format_option_string(option): ''' ('-o', '--option') -> -o, --format METAVAR''' @@ -2746,13 +3961,16 @@ def parseOpts(): def _find_term_columns(): columns = os.environ.get('COLUMNS', None) - if columns: return int(columns) - - if sys.platform.startswith('linux'): - try: columns = os.popen('stty size', 'r').read().split()[1] - except: pass + if columns: + return int(columns) - if columns: return int(columns) + try: + sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out,err = sp.communicate() + return int(out.split()[1]) + except: + pass + return None max_width = 80 max_help_position = 80 @@ -2767,7 +3985,7 @@ def parseOpts(): kw = { 'version' : __version__, 'formatter' : fmt, - 'usage' : 'Usage : %prog [options] url...', + 'usage' : '%prog [options] url [url...]', 'conflict_handler' : 'resolve', } @@ -2775,6 +3993,7 @@ def parseOpts(): # option groups general = optparse.OptionGroup(parser, 'General Options') + selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') @@ -2786,20 +4005,27 @@ def parseOpts(): general.add_option('-v', '--version', action='version', help='print program version and exit') general.add_option('-U', '--update', - action='store_true', dest='update_self', help='update this program to latest stable version') + action='store_true', dest='update_self', help='update this program to latest version') general.add_option('-i', '--ignore-errors', action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) general.add_option('-r', '--rate-limit', dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)') general.add_option('-R', '--retries', dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10) - general.add_option('--playlist-start', - dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) - general.add_option('--playlist-end', - dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) + general.add_option('--list-extractors', + action='store_true', dest='list_extractors', + help='List all supported extractors and the URLs they would handle', default=False) + + selection.add_option('--playlist-start', + dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) + selection.add_option('--playlist-end', + dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) + selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') + selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') + selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) authentication.add_option('-u', '--username', dest='username', metavar='USERNAME', help='account username') @@ -2812,15 +4038,19 @@ def parseOpts(): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', help='video format code') video_format.add_option('--all-formats', - action='store_const', dest='format', help='download all available video formats', const='-1') + action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--max-quality', action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') + video_format.add_option('-F', '--list-formats', + action='store_true', dest='listformats', help='list all available formats (currently youtube only)') verbosity.add_option('-q', '--quiet', action='store_true', dest='quiet', help='activates quiet mode', default=False) verbosity.add_option('-s', '--simulate', - action='store_true', dest='simulate', help='do not download video', default=False) + action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False) + verbosity.add_option('--skip-download', + action='store_true', dest='skip_download', help='do not download the video', default=False) verbosity.add_option('-g', '--get-url', action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) verbosity.add_option('-e', '--get-title', @@ -2834,6 +4064,9 @@ def parseOpts(): verbosity.add_option('--get-filename', action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False) + verbosity.add_option('--get-format', + action='store_true', dest='getformat', + help='simulate, quiet but print output format', default=False) verbosity.add_option('--no-progress', action='store_true', dest='noprogress', help='do not print progress bar', default=False) verbosity.add_option('--console-title', @@ -2849,40 +4082,95 @@ def parseOpts(): action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False) filesystem.add_option('-o', '--output', - dest='outtmpl', metavar='TEMPLATE', help='output filename template') + dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.') filesystem.add_option('-a', '--batch-file', dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') filesystem.add_option('-w', '--no-overwrites', action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) filesystem.add_option('-c', '--continue', action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False) + filesystem.add_option('--no-continue', + action='store_false', dest='continue_dl', + help='do not resume partially downloaded files (restart from beginning)') filesystem.add_option('--cookies', - dest='cookiefile', metavar='FILE', help='file to dump cookie jar to') + dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in') filesystem.add_option('--no-part', action='store_true', dest='nopart', help='do not use .part files', default=False) filesystem.add_option('--no-mtime', action='store_false', dest='updatetime', help='do not use the Last-modified header to set the file modification time', default=True) + filesystem.add_option('--write-description', + action='store_true', dest='writedescription', + help='write video description to a .description file', default=False) + filesystem.add_option('--write-info-json', + action='store_true', dest='writeinfojson', + help='write video metadata to a .info.json file', default=False) postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False, help='convert video files to audio-only files (requires ffmpeg and ffprobe)') postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='"best", "aac" or "mp3"; best by default') + help='"best", "aac", "vorbis" or "mp3"; best by default') + postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K', + help='ffmpeg audio bitrate specification, 128k by default') + postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False, + help='keeps the video file on disk after the post-processing; the video is erased by default') parser.add_option_group(general) + parser.add_option_group(selection) parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) parser.add_option_group(authentication) parser.add_option_group(postproc) - opts, args = parser.parse_args() + xdg_config_home = os.environ.get('XDG_CONFIG_HOME') + if xdg_config_home: + userConf = os.path.join(xdg_config_home, 'youtube-dl.conf') + else: + userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') + argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:] + opts, args = parser.parse_args(argv) return parser, opts, args -def main(): +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. + """ + youtube_ie = YoutubeIE() + google_ie = GoogleIE() + yahoo_ie = YahooIE() + return [ + YoutubePlaylistIE(youtube_ie), + YoutubeUserIE(youtube_ie), + YoutubeSearchIE(youtube_ie), + youtube_ie, + MetacafeIE(youtube_ie), + DailymotionIE(), + google_ie, + GoogleSearchIE(google_ie), + PhotobucketIE(), + yahoo_ie, + YahooSearchIE(yahoo_ie), + DepositFilesIE(), + FacebookIE(), + BlipTVIE(), + VimeoIE(), + MyVideoIE(), + ComedyCentralIE(), + EscapistIE(), + CollegeHumorIE(), + XVideosIE(), + SoundcloudIE(), + InfoQIE(), + MixcloudIE(), + + GenericIE() + ] + +def _real_main(): parser, opts, args = parseOpts() # Open appropriate CookieJar @@ -2901,11 +4189,6 @@ def main(): print std_headers['User-Agent'] sys.exit(0) - # General configuration - cookie_processor = urllib2.HTTPCookieProcessor(jar) - urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())) - socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) - # Batch file verification batchurls = [] if opts.batchfile is not None: @@ -2921,6 +4204,23 @@ def main(): sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args + # General configuration + cookie_processor = urllib2.HTTPCookieProcessor(jar) + opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()) + urllib2.install_opener(opener) + socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) + + extractors = gen_extractors() + + if opts.list_extractors: + for ie in extractors: + print(ie.IE_NAME) + matchedUrls = filter(lambda url: ie.suitable(url), all_urls) + all_urls = filter(lambda url: url not in matchedUrls, all_urls) + for mu in matchedUrls: + print(u' ' + mu) + sys.exit(0) + # Conflicting, missing and erroneous options if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error(u'using .netrc conflicts with giving username/password') @@ -2943,51 +4243,38 @@ def main(): except (TypeError, ValueError), err: parser.error(u'invalid retry count specified') try: - opts.playliststart = long(opts.playliststart) + opts.playliststart = int(opts.playliststart) if opts.playliststart <= 0: - raise ValueError + raise ValueError(u'Playlist start must be positive') except (TypeError, ValueError), err: parser.error(u'invalid playlist start number specified') try: - opts.playlistend = long(opts.playlistend) + opts.playlistend = int(opts.playlistend) if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): - raise ValueError + raise ValueError(u'Playlist end must be greater than playlist start') except (TypeError, ValueError), err: parser.error(u'invalid playlist end number specified') if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'mp3']: + if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']: parser.error(u'invalid audio format specified') - # Information extractors - youtube_ie = YoutubeIE() - metacafe_ie = MetacafeIE(youtube_ie) - dailymotion_ie = DailymotionIE() - youtube_pl_ie = YoutubePlaylistIE(youtube_ie) - youtube_user_ie = YoutubeUserIE(youtube_ie) - youtube_search_ie = YoutubeSearchIE(youtube_ie) - google_ie = GoogleIE() - google_search_ie = GoogleSearchIE(google_ie) - photobucket_ie = PhotobucketIE() - yahoo_ie = YahooIE() - yahoo_search_ie = YahooSearchIE(yahoo_ie) - deposit_files_ie = DepositFilesIE() - facebook_ie = FacebookIE() - generic_ie = GenericIE() - # File downloader fd = FileDownloader({ 'usenetrc': opts.usenetrc, 'username': opts.username, 'password': opts.password, - 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename), + 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 'forceurl': opts.geturl, 'forcetitle': opts.gettitle, 'forcethumbnail': opts.getthumbnail, 'forcedescription': opts.getdescription, 'forcefilename': opts.getfilename, - 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename), + 'forceformat': opts.getformat, + 'simulate': opts.simulate, + 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 'format': opts.format, 'format_limit': opts.format_limit, + 'listformats': opts.listformats, 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding())) or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s') or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s') @@ -3010,28 +4297,18 @@ def main(): 'consoletitle': opts.consoletitle, 'nopart': opts.nopart, 'updatetime': opts.updatetime, + 'writedescription': opts.writedescription, + 'writeinfojson': opts.writeinfojson, + 'matchtitle': opts.matchtitle, + 'rejecttitle': opts.rejecttitle, + 'max_downloads': opts.max_downloads, }) - fd.add_info_extractor(youtube_search_ie) - fd.add_info_extractor(youtube_pl_ie) - fd.add_info_extractor(youtube_user_ie) - fd.add_info_extractor(metacafe_ie) - fd.add_info_extractor(dailymotion_ie) - fd.add_info_extractor(youtube_ie) - fd.add_info_extractor(google_ie) - fd.add_info_extractor(google_search_ie) - fd.add_info_extractor(photobucket_ie) - fd.add_info_extractor(yahoo_ie) - fd.add_info_extractor(yahoo_search_ie) - fd.add_info_extractor(deposit_files_ie) - fd.add_info_extractor(facebook_ie) - - # This must come last since it's the - # fallback if none of the others work - fd.add_info_extractor(generic_ie) + for extractor in extractors: + fd.add_info_extractor(extractor) # PostProcessors if opts.extractaudio: - fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat)) + fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo)) # Update version if opts.update_self: @@ -3054,10 +4331,9 @@ def main(): sys.exit(retcode) - -if __name__ == '__main__': +def main(): try: - main() + _real_main() except DownloadError: sys.exit(1) except SameFileError: @@ -3065,4 +4341,7 @@ if __name__ == '__main__': except KeyboardInterrupt: sys.exit(u'\nERROR: Interrupted by user') +if __name__ == '__main__': + main() + # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: