X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2F__init__.py;h=bf77fb90d3d782a7169e7c60d9a57b5c7f2eb4af;hb=fff24d5e358eadea493e5781c17c476e6907c0d5;hp=fe0fe987e4312feb5b57b5c5d5fb5ab460644550;hpb=0b14e0b367f1ee280fc97e853922758b07d2c742;p=youtube-dl
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index fe0fe987e..bf77fb90d 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-__author__ = (
+__authors__ = (
'Ricardo Garcia Gonzalez',
'Danny Colligan',
'Benjamin Johnson',
@@ -15,15 +15,18 @@ __author__ = (
'Kevin Ngo',
'Ori Avtalion',
'shizeeg',
+ 'Filippo Valsorda',
)
__license__ = 'Public Domain'
-__version__ = '2011.11.23'
+__version__ = '2012.02.27'
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
+
import cookielib
import datetime
+import getpass
import gzip
import htmlentitydefs
import HTMLParser
@@ -31,9 +34,11 @@ import httplib
import locale
import math
import netrc
+import optparse
import os
import os.path
import re
+import shlex
import socket
import string
import subprocess
@@ -259,14 +264,14 @@ def sanitize_open(filename, open_mode):
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
return (sys.stdout, filename)
- stream = open(filename, open_mode)
+ stream = open(_encodeFilename(filename), open_mode)
return (stream, filename)
except (IOError, OSError), err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
# An exception here should be caught in the caller
- stream = open(filename, open_mode)
+ stream = open(_encodeFilename(filename), open_mode)
return (stream, filename)
@@ -290,6 +295,30 @@ def _orderedSet(iterable):
res.append(el)
return res
+def _unescapeHTML(s):
+    """
+    Unescape the HTML entity references found in s.
+
+    @param s a string (of type unicode)
+    @return the result of HTMLParser.HTMLParser().unescape(s)
+    """
+    assert type(s) == type(u'')
+
+    return HTMLParser.HTMLParser().unescape(s)
+
+def _encodeFilename(s):
+    """
+    Convert a unicode file name into the form passed to file system calls.
+
+    On win32 with a Windows major version >= 5 the unicode object is
+    returned unchanged. Everywhere else the name is encoded with the file
+    system encoding, dropping characters that encoding cannot represent.
+
+    @param s The name of the file (of type unicode)
+    @return s itself, or s encoded as a byte string
+    """
+
+    assert type(s) == type(u'')
+
+    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
+        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
+        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+        return s
+
+    # sys.getfilesystemencoding() may return None (on Unix, when the
+    # locale's codeset cannot be determined). s.encode(None, ...) would
+    # then raise TypeError, so fall back to UTF-8.
+    encoding = sys.getfilesystemencoding()
+    if encoding is None:
+        encoding = 'utf-8'
+    return s.encode(encoding, 'ignore')
+
class DownloadError(Exception):
"""Download Error exception.
@@ -317,6 +346,10 @@ class PostProcessingError(Exception):
"""
pass
+class MaxDownloadsReached(Exception):
+    """Raised once the --max-downloads limit has been reached."""
+
class UnavailableVideoError(Exception):
"""Unavailable Format exception.
@@ -458,6 +491,8 @@ class FileDownloader(object):
updatetime: Use the Last-modified header to set output file timestamps.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
+ writesubtitles: Write the video subtitles to a .srt file
+ subtitleslang: Language of the subtitles to download
"""
params = None
@@ -550,16 +585,17 @@ class FileDownloader(object):
self._pps.append(pp)
pp.set_downloader(self)
- def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
+ def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
- try:
- if not self.params.get('quiet', False):
- terminator = [u'\n', u''][skip_eol]
- print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
+ # message must be a unicode object; byte strings fail the assert below.
+ assert type(message) == type(u'')
+ if not self.params.get('quiet', False):
+ # skip_eol=True selects the empty terminator, so no newline is appended.
+ terminator = [u'\n', u''][skip_eol]
+ output = message + terminator
+
+ # Encode with the 'ignore' error handler: characters that the preferred
+ # encoding cannot represent are dropped instead of raising.
+ if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+ output = output.encode(preferredencoding(), 'ignore')
+ self._screen_file.write(output)
self._screen_file.flush()
- except (UnicodeEncodeError), err:
- if not ignore_encoding_errors:
- raise
def to_stderr(self, message):
"""Print message to stderr."""
@@ -609,7 +645,7 @@ class FileDownloader(object):
def temp_name(self, filename):
"""Returns a temporary filename for the given filename."""
if self.params.get('nopart', False) or filename == u'-' or \
- (os.path.exists(filename) and not os.path.isfile(filename)):
+ (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
return filename
return filename + u'.part'
@@ -622,7 +658,7 @@ class FileDownloader(object):
try:
if old_filename == new_filename:
return
- os.rename(old_filename, new_filename)
+ os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
except (IOError, OSError), err:
self.trouble(u'ERROR: unable to rename file')
@@ -630,7 +666,7 @@ class FileDownloader(object):
"""Try to set the last-modified time of the given file."""
if last_modified_hdr is None:
return
- if not os.path.isfile(filename):
+ if not os.path.isfile(_encodeFilename(filename)):
return
timestr = last_modified_hdr
if timestr is None:
@@ -646,15 +682,19 @@ class FileDownloader(object):
def report_writedescription(self, descfn):
""" Report that the description file is being written """
- self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
+ self.to_screen(u'[info] Writing video description to: ' + descfn)
+
+    def report_writesubtitles(self, srtfn):
+        """Announce on screen the file the subtitles are being written to."""
+        message = u'[info] Writing video subtitles to: ' + srtfn
+        self.to_screen(message)
def report_writeinfojson(self, infofn):
""" Report that the metadata file has been written """
- self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
+ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
def report_destination(self, filename):
"""Report destination filename."""
- self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
+ self.to_screen(u'[download] Destination: ' + filename)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
"""Report download progress."""
@@ -730,8 +770,7 @@ class FileDownloader(object):
max_downloads = self.params.get('max_downloads')
if max_downloads is not None:
if self._num_downloads > int(max_downloads):
- self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
- return
+ raise MaxDownloadsReached()
filename = self.prepare_filename(info_dict)
@@ -756,13 +795,9 @@ class FileDownloader(object):
if filename is None:
return
- if self.params.get('nooverwrites', False) and os.path.exists(filename):
- self.to_stderr(u'WARNING: file exists and will be skipped')
- return
-
try:
- dn = os.path.dirname(filename)
- if dn != '' and not os.path.exists(dn):
+ dn = os.path.dirname(_encodeFilename(filename))
+ if dn != '' and not os.path.exists(dn): # dn is already encoded
os.makedirs(dn)
except (OSError, IOError), err:
self.trouble(u'ERROR: unable to create directory ' + unicode(err))
@@ -770,9 +805,9 @@ class FileDownloader(object):
if self.params.get('writedescription', False):
try:
- descfn = filename + '.description'
+ descfn = filename + u'.description'
self.report_writedescription(descfn)
- descfile = open(descfn, 'wb')
+ descfile = open(_encodeFilename(descfn), 'wb')
try:
descfile.write(info_dict['description'].encode('utf-8'))
finally:
@@ -780,9 +815,24 @@ class FileDownloader(object):
except (OSError, IOError):
self.trouble(u'ERROR: Cannot write description file ' + descfn)
return
+
+        if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
+            # Subtitle download errors are already reported as troubles by the
+            # relevant IE. When the IE provides no subtitles the condition
+            # above is false and this step is skipped silently.
+            try:
+                # splitext() only strips an extension from the last path
+                # component; rsplit('.', 1) would cut at a dot inside a
+                # directory name when the file name itself has none.
+                srtfn = os.path.splitext(filename)[0] + u'.srt'
+                self.report_writesubtitles(srtfn)
+                srtfile = open(_encodeFilename(srtfn), 'wb')
+                try:
+                    srtfile.write(info_dict['subtitles'].encode('utf-8'))
+                finally:
+                    srtfile.close()
+            except (OSError, IOError):
+                # Report the subtitles path. descfn belongs to the description
+                # step and is unbound unless 'writedescription' is enabled.
+                self.trouble(u'ERROR: Cannot write subtitles file ' + srtfn)
+                return
if self.params.get('writeinfojson', False):
- infofn = filename + '.info.json'
+ infofn = filename + u'.info.json'
self.report_writeinfojson(infofn)
try:
json.dump
@@ -790,7 +840,7 @@ class FileDownloader(object):
self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
return
try:
- infof = open(infofn, 'wb')
+ infof = open(_encodeFilename(infofn), 'wb')
try:
json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
json.dump(json_info_dict, infof)
@@ -801,16 +851,19 @@ class FileDownloader(object):
return
if not self.params.get('skip_download', False):
- try:
- success = self._do_download(filename, info_dict)
- except (OSError, IOError), err:
- raise UnavailableVideoError
- except (urllib2.URLError, httplib.HTTPException, socket.error), err:
- self.trouble(u'ERROR: unable to download video data: %s' % str(err))
- return
- except (ContentTooShortError, ), err:
- self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
- return
+ if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
+ success = True
+ else:
+ try:
+ success = self._do_download(filename, info_dict)
+ except (OSError, IOError), err:
+ raise UnavailableVideoError
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self.trouble(u'ERROR: unable to download video data: %s' % str(err))
+ return
+ except (ContentTooShortError, ), err:
+ self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ return
if success:
try:
@@ -869,13 +922,21 @@ class FileDownloader(object):
# the connection was interrumpted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
- retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
+ args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
+ if self.params.get('verbose', False):
+ try:
+ import pipes
+ shell_quote = lambda args: ' '.join(map(pipes.quote, args))
+ except ImportError:
+ shell_quote = repr
+ self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
+ retval = subprocess.call(args)
while retval == 2 or retval == 1:
- prevsize = os.path.getsize(tmpfilename)
+ prevsize = os.path.getsize(_encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
time.sleep(5.0) # This seems to be needed
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
- cursize = os.path.getsize(tmpfilename)
+ cursize = os.path.getsize(_encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
@@ -884,7 +945,7 @@ class FileDownloader(object):
retval = 0
break
if retval == 0:
- self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
+ self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
self.try_rename(tmpfilename, filename)
return True
else:
@@ -896,7 +957,7 @@ class FileDownloader(object):
player_url = info_dict.get('player_url', None)
# Check file already present
- if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
+ if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
self.report_file_already_downloaded(filename)
return True
@@ -913,8 +974,8 @@ class FileDownloader(object):
request = urllib2.Request(url, None, headers)
# Establish possible resume length
- if os.path.isfile(tmpfilename):
- resume_len = os.path.getsize(tmpfilename)
+ if os.path.isfile(_encodeFilename(tmpfilename)):
+ resume_len = os.path.getsize(_encodeFilename(tmpfilename))
else:
resume_len = 0
@@ -1118,6 +1179,7 @@ class YoutubeIE(InfoExtractor):
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
+ _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -1166,6 +1228,10 @@ class YoutubeIE(InfoExtractor):
"""Report attempt to download video info webpage."""
self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
+ def report_video_subtitles_download(self, video_id):
+ """Report attempt to download video subtitles."""
+ self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -1178,6 +1244,23 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self._downloader.to_screen(u'[youtube] RTMP download detected')
+    def _closed_captions_xml_to_srt(self, xml_string):
+        """Convert a closed-captions XML document to SRT text.
+
+        @param xml_string the captions XML (the caller passes a unicode object)
+        @return the captions as SRT text; an empty string if no caption matched
+        """
+        # Each caption is expected to look like
+        #   <text start="SECONDS" dur="SECONDS">CAPTION</text>
+        # where the dur attribute may be missing.
+        # NOTE(review): element layout reconstructed from the four fields
+        # unpacked in the loop below -- confirm against a real response.
+        # TODO parse xml instead of regex
+        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
+
+        def srt_time(seconds):
+            # HH:MM:SS,mmm
+            return "%02i:%02i:%02i,%03i" % (seconds/(60*60), seconds/60%60, seconds%60, seconds%1*1000)
+
+        cues = []
+        for n, (start, dur_tag, dur, caption) in enumerate(texts):
+            # Captions without a dur attribute get a 4 second duration
+            if not dur: dur = '4'
+            start = float(start)
+            end = start + float(dur)
+            # double cycle, intentional (presumably entities can be escaped twice)
+            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
+            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
+            # SRT cue numbers start at 1, enumerate() starts at 0
+            cues.append(str(n + 1) + '\n' + srt_time(start) + ' --> ' + srt_time(end) + '\n' + caption + '\n\n')
+        # Join once instead of growing a string inside the loop
+        return ''.join(cues)
+
def _print_formats(self, formats):
print 'Available formats:'
for x in formats:
@@ -1341,15 +1424,45 @@ class YoutubeIE(InfoExtractor):
lxml.etree
except NameError:
video_description = u'No description available.'
- if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
- mobj = re.search(r'', video_webpage)
- if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
+ mobj = re.search(r'', video_webpage)
+ if mobj is not None:
+ video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# TODO use another parser
+
+ # closed captions
+ video_subtitles = None
+ if self._downloader.params.get('writesubtitles', False):
+ self.report_video_subtitles_download(video_id)
+ request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+ try:
+ srt_list = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+ else:
+ srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
+ if srt_lang_list:
+ if self._downloader.params.get('subtitleslang', False):
+ srt_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in srt_lang_list:
+ srt_lang = 'en'
+ else:
+ srt_lang = srt_lang_list[0]
+ if not srt_lang in srt_lang_list:
+ self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
+ else:
+ request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+ try:
+ srt_xml = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+ else:
+ video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+ else:
+ self._downloader.trouble(u'WARNING: video has no closed captions')
# token
video_token = urllib.unquote_plus(video_info['token'][0])
@@ -1367,10 +1480,11 @@ class YoutubeIE(InfoExtractor):
url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
format_limit = self._downloader.params.get('format_limit', None)
- if format_limit is not None and format_limit in self._available_formats:
- format_list = self._available_formats[self._available_formats.index(format_limit):]
+ available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+ if format_limit is not None and format_limit in available_formats:
+ format_list = available_formats[available_formats.index(format_limit):]
else:
- format_list = self._available_formats
+ format_list = available_formats
existing_formats = [x for x in format_list if x in url_map]
if len(existing_formats) == 0:
self._downloader.trouble(u'ERROR: no known formats available for video')
@@ -1421,6 +1535,7 @@ class YoutubeIE(InfoExtractor):
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description,
'player_url': player_url,
+ 'subtitles': video_subtitles
})
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video')
@@ -1596,7 +1711,6 @@ class DailymotionIE(InfoExtractor):
self._downloader.increment_downloads()
video_id = mobj.group(1)
- simple_title = mobj.group(2).decode('utf-8')
video_extension = 'flv'
# Retrieve video webpage to extract further information
@@ -1626,12 +1740,13 @@ class DailymotionIE(InfoExtractor):
video_url = mediaURL
- mobj = re.search(r'(?im)
\s*(.+)\s*-\s*Video\s+Dailymotion', webpage)
+ mobj = re.search(r'', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
video_title = sanitize_title(video_title)
+ simple_title = _simplify_title(video_title)
mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage)
if mobj is None:
@@ -2018,7 +2133,7 @@ class VimeoIE(InfoExtractor):
video_id = mobj.group(1)
# Retrieve video webpage to extract further information
- request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
+ request = urllib2.Request(url, None, std_headers)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
@@ -2031,77 +2146,75 @@ class VimeoIE(InfoExtractor):
# and latter we extract those that are Vimeo specific.
self.report_extraction(video_id)
- # Extract title
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video title')
+ # Extract the config JSON
+ config = webpage.split(' = {config:')[1].split(',assets:')[0]
+ try:
+ config = json.loads(config)
+ except:
+ self._downloader.trouble(u'ERROR: unable to extract info section')
return
- video_title = mobj.group(1).decode('utf-8')
+
+ # Extract title
+ video_title = config["video"]["title"]
simple_title = _simplify_title(video_title)
# Extract uploader
- mobj = re.search(r'http://vimeo.com/(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video uploader')
- return
- video_uploader = mobj.group(1).decode('utf-8')
+ video_uploader = config["video"]["owner"]["name"]
# Extract video thumbnail
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
- return
- video_thumbnail = mobj.group(1).decode('utf-8')
+ video_thumbnail = config["video"]["thumbnail"]
- # # Extract video description
- # mobj = re.search(r'', webpage)
- # if mobj is None:
- # self._downloader.trouble(u'ERROR: unable to extract video description')
- # return
- # video_description = mobj.group(1).decode('utf-8')
- # if not video_description: video_description = 'No description available.'
- video_description = 'Foo.'
-
- # Vimeo specific: extract request signature
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract request signature')
- return
- sig = mobj.group(1).decode('utf-8')
-
- # Vimeo specific: extract video quality information
- mobj = re.search(r'(\d+)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video quality information')
- return
- quality = mobj.group(1).decode('utf-8')
-
- if int(quality) == 1:
- quality = 'hd'
+ # Extract video description
+ try:
+ lxml.etree
+ except NameError:
+ video_description = u'No description available.'
+ mobj = re.search(r'', webpage, re.MULTILINE)
+ if mobj is not None:
+ video_description = mobj.group(1)
else:
- quality = 'sd'
+ html_parser = lxml.etree.HTMLParser()
+ vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
+ video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
+ # TODO use another parser
- # Vimeo specific: Extract request signature expiration
- mobj = re.search(r'(.*?)', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
+ # Extract upload date
+ video_upload_date = u'NA'
+ mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage)
+ if mobj is not None:
+ video_upload_date = mobj.group(1)
+
+ # Vimeo specific: extract request signature and timestamp
+ sig = config['request']['signature']
+ timestamp = config['request']['timestamp']
+
+ # Vimeo specific: extract video codec and quality information
+ # TODO bind to format param
+ codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
+ for codec in codecs:
+ if codec[0] in config["video"]["files"]:
+ video_codec = codec[0]
+ video_extension = codec[1]
+ if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
+ else: quality = 'sd'
+ break
+ else:
+ self._downloader.trouble(u'ERROR: no known codec found')
return
- sig_exp = mobj.group(1).decode('utf-8')
- video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
+ video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
+ %(video_id, sig, timestamp, quality, video_codec.upper())
try:
# Process video information
self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
+ 'id': video_id,
'url': video_url,
'uploader': video_uploader,
- 'upload_date': u'NA',
+ 'upload_date': video_upload_date,
'title': video_title,
'stitle': simple_title,
- 'ext': u'mp4',
- 'thumbnail': video_thumbnail.decode('utf-8'),
- 'description': video_description,
+ 'ext': video_extension,
'thumbnail': video_thumbnail,
'description': video_description,
'player_url': None,
@@ -2210,9 +2323,7 @@ class GenericIE(InfoExtractor):
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
- _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
- _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
- _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*'
+ _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_youtube_ie = None
_max_youtube_results = 1000
IE_NAME = u'youtube:search'
@@ -2263,45 +2374,39 @@ class YoutubeSearchIE(InfoExtractor):
"""Downloads a specified number of results for a query"""
video_ids = []
- already_seen = set()
- pagenum = 1
+ pagenum = 0
+ limit = n
- while True:
- self.report_download_page(query, pagenum)
- result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
+ while (50 * pagenum) < limit:
+ self.report_download_page(query, pagenum+1)
+ result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
request = urllib2.Request(result_url)
try:
- page = urllib2.urlopen(request).read()
+ data = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
- self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
+ self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
return
+ api_response = json.loads(data)['data']
- # Extract video identifiers
- for mobj in re.finditer(self._VIDEO_INDICATOR, page):
- video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
- if video_id not in already_seen:
- video_ids.append(video_id)
- already_seen.add(video_id)
- if len(video_ids) == n:
- # Specified n videos reached
- for id in video_ids:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
- return
+ new_ids = list(video['id'] for video in api_response['items'])
+ video_ids += new_ids
- if re.search(self._MORE_PAGES_INDICATOR, page) is None:
- for id in video_ids:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
- return
+ limit = min(n, api_response['totalItems'])
+ pagenum += 1
- pagenum = pagenum + 1
+ if len(video_ids) > n:
+ video_ids = video_ids[:n]
+ for id in video_ids:
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ return
class GoogleSearchIE(InfoExtractor):
"""Information Extractor for Google Video search queries."""
_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
- _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
- _MORE_PAGES_INDICATOR = r'Next'
+ _VIDEO_INDICATOR = r'\s*Next\s*'
_youtube_ie = None
IE_NAME = u'youtube:playlist'
@@ -2531,7 +2634,7 @@ class YoutubePlaylistIE(InfoExtractor):
# Extract video identifiers
ids_in_page = []
- for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+ for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
@@ -2542,7 +2645,10 @@ class YoutubePlaylistIE(InfoExtractor):
playliststart = self._downloader.params.get('playliststart', 1) - 1
playlistend = self._downloader.params.get('playlistend', -1)
- video_ids = video_ids[playliststart:playlistend]
+ if playlistend == -1:
+ video_ids = video_ids[playliststart:]
+ else:
+ video_ids = video_ids[playliststart:playlistend]
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
@@ -2630,7 +2736,7 @@ class YoutubeUserIE(InfoExtractor):
else:
video_ids = video_ids[playliststart:playlistend]
- self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
+ self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
(username, all_ids_count, len(video_ids)))
for video_id in video_ids:
@@ -2988,14 +3094,14 @@ class BlipTVIE(InfoExtractor):
data = json_data['Post']
else:
data = json_data
-
+
upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
video_url = data['media']['url']
umobj = re.match(self._URL_EXT, video_url)
if umobj is None:
raise ValueError('Can not determine filename extension')
ext = umobj.group(1)
-
+
info = {
'id': data['item_id'],
'url': video_url,
@@ -3096,7 +3202,7 @@ class ComedyCentralIE(InfoExtractor):
def report_extraction(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
-
+
def report_config_download(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
@@ -3145,7 +3251,7 @@ class ComedyCentralIE(InfoExtractor):
return
epTitle = mobj.group('episode')
- mMovieParams = re.findall('', html)
+ mMovieParams = re.findall('(?:https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P[0-9]+)/[^/]+$'
+ IE_NAME = u'mtv'
+
+ def report_webpage(self, video_id):
+ """Report webpage download."""
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
+
+ def report_extraction(self, video_id):
+ """Report information extraction for the given video id."""
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
+
+    def _real_extract(self, url):
+        """Extract one MTV video page and hand the result to the downloader.
+
+        Downloads the page at url, reads song name, performer, mtvn_uri and
+        content id from it, fetches the mediaGen metadata XML and passes the
+        last listed rendition to the downloader's process_info().
+        Failures detected along the way are reported through the
+        downloader's trouble() and end the extraction.
+
+        @param url the video page URL, matched against _VALID_URL
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+        # Add a scheme when the URL was given without one
+        if not mobj.group('proto'):
+            url = 'http://' + url
+        video_id = mobj.group('videoid')
+        self.report_webpage(video_id)
+
+        request = urllib2.Request(url)
+        try:
+            webpage = urllib2.urlopen(request).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
+            return
+
+        # NOTE(review): the three page patterns below (song name, performer,
+        # mtvn_uri) are empty in the text under review and have no capture
+        # group, so group(1) would fail -- restore them from upstream.
+        mobj = re.search(r'', webpage)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: unable to extract song name')
+            return
+        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
+        mobj = re.search(r'', webpage)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: unable to extract performer')
+            return
+        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
+        video_title = performer + ' - ' + song_name
+
+        mobj = re.search(r'', webpage)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
+            return
+        mtvn_uri = mobj.group(1)
+
+        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: unable to extract content id')
+            return
+        content_id = mobj.group(1)
+
+        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
+        self.report_extraction(video_id)
+        request = urllib2.Request(videogen_url)
+        try:
+            metadataXml = urllib2.urlopen(request).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
+            return
+
+        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
+        renditions = mdoc.findall('.//rendition')
+        if not renditions:
+            # renditions[-1] would raise IndexError on an empty list
+            self._downloader.trouble(u'ERROR: unable to find any rendition')
+            return
+
+        # For now, always pick the highest quality.
+        # (assumes renditions are listed in ascending quality -- TODO confirm)
+        rendition = renditions[-1]
+
+        try:
+            _,_,ext = rendition.attrib['type'].partition('/')
+            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
+            video_url = rendition.find('./src').text
+        except (KeyError, AttributeError):
+            # KeyError: a rendition attribute is missing.
+            # AttributeError: find() returned None, i.e. there is no src child.
+            self._downloader.trouble(u'ERROR: Invalid rendition field.')
+            return
+
+        self._downloader.increment_downloads()
+        info = {
+            'id': video_id,
+            'url': video_url,
+            'uploader': performer,
+            'title': video_title,
+            'stitle': _simplify_title(video_title),
+            'ext': ext,
+            'format': video_format,
+        }
+
+        try:
+            self._downloader.process_info(info)
+        except UnavailableVideoError, err:
+            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
+
class PostProcessor(object):
"""Post Processor class.
@@ -3918,6 +4117,9 @@ class PostProcessor(object):
"""
return information # by default, do nothing
+class AudioConversionError(Exception):
+    """Raised when ffmpeg cannot be found or exits with a non-zero status.
+
+    The human-readable reason is kept in the message attribute.
+    """
+    def __init__(self, message):
+        # Hand the message to the base class so that str(err) and err.args
+        # carry it; without this call both would be empty.
+        Exception.__init__(self, message)
+        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
@@ -3932,7 +4134,7 @@ class FFmpegExtractAudioPP(PostProcessor):
@staticmethod
def get_audio_codec(path):
try:
- cmd = ['ffprobe', '-show_streams', '--', path]
+ cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
output = handle.communicate()[0]
if handle.wait() != 0:
@@ -3949,12 +4151,23 @@ class FFmpegExtractAudioPP(PostProcessor):
@staticmethod
def run_ffmpeg(path, out_path, codec, more_opts):
+ """
+ Run ffmpeg on path with -vn (no video) and write the result to out_path.
+
+ @param path input file name (of type unicode)
+ @param out_path output file name (of type unicode)
+ @param codec value for ffmpeg's -acodec option, or None to omit the option
+ @param more_opts list of extra ffmpeg arguments placed before the output file
+ Raises AudioConversionError if ffmpeg cannot be found or exits with a non-zero status.
+ """
+ if codec is None:
+ acodec_opts = []
+ else:
+ acodec_opts = ['-acodec', codec]
+ cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
try:
- cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
- ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
- return (ret == 0)
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout,stderr = p.communicate()
except (IOError, OSError):
- return False
+ e = sys.exc_info()[1]
+ # errno 2 is ENOENT: the ffmpeg executable could not be found
+ if isinstance(e, OSError) and e.errno == 2:
+ raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
+ else:
+ # A bare raise keeps the original traceback; 'raise e' would restart it here.
+ raise
+ if p.returncode != 0:
+ # Use the last line of ffmpeg's stderr output as the failure reason
+ msg = stderr.strip().split('\n')[-1]
+ raise AudioConversionError(msg)
def run(self, information):
path = information['filepath']
@@ -3965,8 +4178,13 @@ class FFmpegExtractAudioPP(PostProcessor):
return None
more_opts = []
- if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
- if filecodec in ['aac', 'mp3', 'vorbis']:
+ if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
+ if self._preferredcodec == 'm4a' and filecodec == 'aac':
+ # Lossless, but in another container
+ acodec = 'copy'
+ extension = self._preferredcodec
+ more_opts = ['-absf', 'aac_adtstoasc']
+ elif filecodec in ['aac', 'mp3', 'vorbis']:
# Lossless if possible
acodec = 'copy'
extension = filecodec
@@ -3983,35 +4201,44 @@ class FFmpegExtractAudioPP(PostProcessor):
more_opts += ['-ab', self._preferredquality]
else:
# We convert the audio (lossy)
- acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
+ acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
extension = self._preferredcodec
more_opts = []
if self._preferredquality is not None:
more_opts += ['-ab', self._preferredquality]
if self._preferredcodec == 'aac':
more_opts += ['-f', 'adts']
+ if self._preferredcodec == 'm4a':
+ more_opts += ['-absf', 'aac_adtstoasc']
if self._preferredcodec == 'vorbis':
extension = 'ogg'
+ if self._preferredcodec == 'wav':
+ extension = 'wav'
+ more_opts += ['-f', 'wav']
- (prefix, ext) = os.path.splitext(path)
- new_path = prefix + '.' + extension
- self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
- status = self.run_ffmpeg(path, new_path, acodec, more_opts)
-
- if not status:
- self._downloader.to_stderr(u'WARNING: error running ffmpeg')
+ prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
+ new_path = prefix + sep + extension
+ self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
+ try:
+ self.run_ffmpeg(path, new_path, acodec, more_opts)
+ except:
+ etype,e,tb = sys.exc_info()
+ if isinstance(e, AudioConversionError):
+ self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
+ else:
+ self._downloader.to_stderr(u'ERROR: error running ffmpeg')
return None
# Try to update the date time for extracted audio file.
if information.get('filetime') is not None:
try:
- os.utime(new_path, (time.time(), information['filetime']))
+ os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
except:
self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
if not self._keepvideo:
try:
- os.remove(path)
+ os.remove(_encodeFilename(path))
except (IOError, OSError):
self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
return None
@@ -4026,7 +4253,7 @@ def updateSelf(downloader, filename):
if not os.access(filename, os.W_OK):
sys.exit('ERROR: no write permissions on %s' % filename)
- downloader.to_screen('Updating to latest version...')
+ downloader.to_screen(u'Updating to latest version...')
try:
try:
@@ -4035,7 +4262,7 @@ def updateSelf(downloader, filename):
vmatch = re.search("__version__ = '([^']+)'", newcontent)
if vmatch is not None and vmatch.group(1) == __version__:
- downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
+ downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
return
finally:
urlh.close()
@@ -4051,17 +4278,12 @@ def updateSelf(downloader, filename):
except (IOError, OSError), err:
sys.exit('ERROR: unable to overwrite current version')
- downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
+ downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def parseOpts():
- # Deferred imports
- import getpass
- import optparse
- import shlex
-
- def _readOptions(filename):
+ def _readOptions(filename_bytes):
try:
- optionf = open(filename)
+ optionf = open(filename_bytes)
except IOError:
return [] # silently skip if file is not present
try:
@@ -4165,10 +4387,18 @@ def parseOpts():
action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='all')
+ video_format.add_option('--prefer-free-formats',
+ action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
+ video_format.add_option('--write-srt',
+ action='store_true', dest='writesubtitles',
+ help='write video closed captions to a .srt file (currently youtube only)', default=False)
+ video_format.add_option('--srt-lang',
+ action='store', dest='subtitleslang', metavar='LANG',
+ help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
verbosity.add_option('-q', '--quiet',
@@ -4198,6 +4428,8 @@ def parseOpts():
verbosity.add_option('--console-title',
action='store_true', dest='consoletitle',
help='display progress in console titlebar', default=False)
+ verbosity.add_option('-v', '--verbose',
+ action='store_true', dest='verbose', help='print various debugging information', default=False)
filesystem.add_option('-t', '--title',
@@ -4214,7 +4446,7 @@ def parseOpts():
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
- action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
+ action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
action='store_false', dest='continue_dl',
help='do not resume partially downloaded files (restart from beginning)')
@@ -4236,7 +4468,7 @@ def parseOpts():
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
- help='"best", "aac", "vorbis" or "mp3"; best by default')
+ help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
@@ -4293,6 +4525,7 @@ def gen_extractors():
InfoQIE(),
MixcloudIE(),
StanfordOpenClassroomIE(),
+ MTVIE(),
GenericIE()
]
@@ -4333,10 +4566,14 @@ def _real_main():
# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
- opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
+ proxy_handler = urllib2.ProxyHandler()
+ opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
+ if opts.verbose:
+ print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
+
extractors = gen_extractors()
if opts.list_extractors:
@@ -4382,7 +4619,7 @@ def _real_main():
except (TypeError, ValueError), err:
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
- if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
+ if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
parser.error(u'invalid audio format specified')
# File downloader
@@ -4426,9 +4663,13 @@ def _real_main():
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson,
+ 'writesubtitles': opts.writesubtitles,
+ 'subtitleslang': opts.subtitleslang,
'matchtitle': opts.matchtitle,
'rejecttitle': opts.rejecttitle,
'max_downloads': opts.max_downloads,
+ 'prefer_free_formats': opts.prefer_free_formats,
+ 'verbose': opts.verbose,
})
for extractor in extractors:
fd.add_info_extractor(extractor)
@@ -4447,7 +4688,12 @@ def _real_main():
parser.error(u'you must provide at least one URL')
else:
sys.exit()
- retcode = fd.download(all_urls)
+
+ try:
+ retcode = fd.download(all_urls)
+ except MaxDownloadsReached:
+ fd.to_screen(u'--max-download limit reached, aborting.')
+ retcode = 101
# Dump cookie jar if requested
if opts.cookiefile is not None: