2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
38 import cStringIO as StringIO
42 # parse_qs was moved from the cgi module to the urlparse module recently.
44 from urlparse import parse_qs
46 from cgi import parse_qs
50 except ImportError: # Python < 2.6
54 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
55 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
56 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
57 'Accept-Encoding': 'gzip, deflate',
58 'Accept-Language': 'en-us,en;q=0.5',
61 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
65 except ImportError: # Python <2.5, use trivialjson (https://github.com/phihag/trivialjson):
	def raiseError(msg, i):
		# Fail loudly with full context: the offset, the whole input and the
		# unconsumed tail, so malformed JSON is easy to pinpoint.
		# NOTE(review): `s` is a closure variable of the enclosing JSON
		# parser function, which is not visible in this excerpt -- confirm.
		raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
73 def skipSpace(i, expectMore=True):
74 while i < len(s) and s[i] in ' \t\r\n':
78 raiseError('Premature end', i)
80 def decodeEscape(match):
96 return unichr(int(esc[1:5], 16))
97 if len(esc) == 5+6 and esc[5:7] == '\\u':
98 hi = int(esc[1:5], 16)
99 low = int(esc[7:11], 16)
100 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
101 raise ValueError('Unknown escape ' + str(esc))
108 while s[e-bslashes-1] == '\\':
110 if bslashes % 2 == 1:
114 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
115 stri = rexp.sub(decodeEscape, s[i:e])
121 if s[i] == '}': # Empty dictionary
125 raiseError('Expected a string object key', i)
126 i,key = parseString(i)
128 if i >= len(s) or s[i] != ':':
129 raiseError('Expected a colon', i)
136 raiseError('Expected comma or closing curly brace', i)
141 if s[i] == ']': # Empty array
146 i = skipSpace(i) # Raise exception if premature end
150 raiseError('Expected a comma or closing bracket', i)
152 def parseDiscrete(i):
153 for k,v in {'true': True, 'false': False, 'null': None}.items():
154 if s.startswith(k, i):
156 raiseError('Not a boolean (or null)', i)
158 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
160 raiseError('Not a number', i)
162 if '.' in nums or 'e' in nums or 'E' in nums:
163 return (i+len(nums), float(nums))
164 return (i+len(nums), int(nums))
165 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
168 i,res = CHARMAP.get(s[i], parseNumber)(i)
169 i = skipSpace(i, False)
173 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
176 def preferredencoding():
177 """Get preferred encoding.
179 Returns the best encoding scheme for the system, based on
180 locale.getpreferredencoding() and some further tweaks.
182 def yield_preferredencoding():
184 pref = locale.getpreferredencoding()
190 return yield_preferredencoding().next()
192 def htmlentity_transform(matchobj):
193 """Transforms an HTML entity to a Unicode character.
195 This function receives a match object and is intended to be used with
196 the re.sub() function.
198 entity = matchobj.group(1)
200 # Known non-numeric HTML entity
201 if entity in htmlentitydefs.name2codepoint:
202 return unichr(htmlentitydefs.name2codepoint[entity])
205 mobj = re.match(ur'(?u)#(x?\d+)', entity)
207 numstr = mobj.group(1)
208 if numstr.startswith(u'x'):
210 numstr = u'0%s' % numstr
213 return unichr(long(numstr, base))
215 # Unknown entity in name, return its literal representation
216 return (u'&%s;' % entity)
218 def sanitize_title(utitle):
219 """Sanitizes a video title so it could be used as part of a filename."""
220 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
221 return utitle.replace(unicode(os.sep), u'%')
223 def sanitize_open(filename, open_mode):
224 """Try to open the given filename, and slightly tweak it if this fails.
226 Attempts to open the given filename. If this fails, it tries to change
227 the filename slightly, step by step, until it's either able to open it
228 or it fails and raises a final exception, like the standard open()
231 It returns the tuple (stream, definitive_file_name).
235 if sys.platform == 'win32':
237 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238 return (sys.stdout, filename)
239 stream = open(filename, open_mode)
240 return (stream, filename)
241 except (IOError, OSError), err:
242 # In case of error, try to remove win32 forbidden chars
243 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
245 # An exception here should be caught in the caller
246 stream = open(filename, open_mode)
247 return (stream, filename)
249 def timeconvert(timestr):
250 """Convert RFC 2822 defined time string into system timestamp"""
252 timetuple = email.utils.parsedate_tz(timestr)
253 if timetuple is not None:
254 timestamp = email.utils.mktime_tz(timetuple)
257 class DownloadError(Exception):
258 """Download Error exception.
260 This exception may be thrown by FileDownloader objects if they are not
261 configured to continue on errors. They will contain the appropriate
266 class SameFileError(Exception):
267 """Same File exception.
269 This exception will be thrown by FileDownloader objects if they detect
270 multiple files would have to be downloaded to the same file on disk.
274 class PostProcessingError(Exception):
275 """Post Processing exception.
277 This exception may be raised by PostProcessor's .run() method to
278 indicate an error in the postprocessing task.
282 class UnavailableVideoError(Exception):
283 """Unavailable Format exception.
285 This exception will be thrown when a video is requested
286 in a format that is not available for that video.
290 class ContentTooShortError(Exception):
291 """Content Too Short exception.
293 This exception may be raised by FileDownloader objects when a file they
294 download is too small for what the server announced first, indicating
295 the connection was probably interrupted.
301 def __init__(self, downloaded, expected):
302 self.downloaded = downloaded
303 self.expected = expected
305 class YoutubeDLHandler(urllib2.HTTPHandler):
306 """Handler for HTTP requests and responses.
308 This class, when installed with an OpenerDirector, automatically adds
309 the standard headers to every HTTP request and handles gzipped and
310 deflated responses from web servers. If compression is to be avoided in
311 a particular request, the original request in the program code only has
312 to include the HTTP header "Youtubedl-No-Compression", which will be
313 removed before making the real request.
315 Part of this code was copied from:
317 http://techknack.net/python-urllib2-handlers/
319 Andrew Rowls, the author of that code, agreed to release it to the
326 return zlib.decompress(data, -zlib.MAX_WBITS)
328 return zlib.decompress(data)
331 def addinfourl_wrapper(stream, headers, url, code):
332 if hasattr(urllib2.addinfourl, 'getcode'):
333 return urllib2.addinfourl(stream, headers, url, code)
334 ret = urllib2.addinfourl(stream, headers, url)
338 def http_request(self, req):
339 for h in std_headers:
342 req.add_header(h, std_headers[h])
343 if 'Youtubedl-no-compression' in req.headers:
344 if 'Accept-encoding' in req.headers:
345 del req.headers['Accept-encoding']
346 del req.headers['Youtubedl-no-compression']
349 def http_response(self, req, resp):
352 if resp.headers.get('Content-encoding', '') == 'gzip':
353 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
354 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
355 resp.msg = old_resp.msg
357 if resp.headers.get('Content-encoding', '') == 'deflate':
358 gz = StringIO.StringIO(self.deflate(resp.read()))
359 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
360 resp.msg = old_resp.msg
363 class FileDownloader(object):
364 """File Downloader class.
	File downloader objects are the ones responsible for downloading the
367 actual video file and writing it to disk if the user has requested
368 it, among some other tasks. In most cases there should be one per
369 program. As, given a video URL, the downloader doesn't know how to
370 extract all the needed information, task that InfoExtractors do, it
371 has to pass the URL to one of them.
373 For this, file downloader objects have a method that allows
374 InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
376 finds that reports being able to handle it. The InfoExtractor extracts
377 all the information about the video or videos the URL refers to, and
378 asks the FileDownloader to process the video information, possibly
379 downloading the video.
381 File downloaders accept a lot of parameters. In order not to saturate
382 the object constructor with arguments, it receives a dictionary of
383 options instead. These options are available through the params
384 attribute for the InfoExtractors to use. The FileDownloader also
385 registers itself as the downloader in charge for the InfoExtractors
386 that are added to it, so this is a "mutual registration".
390 username: Username for authentication purposes.
391 password: Password for authentication purposes.
392 usenetrc: Use netrc for authentication instead.
393 quiet: Do not print messages to stdout.
394 forceurl: Force printing final URL.
395 forcetitle: Force printing title.
396 forcethumbnail: Force printing thumbnail URL.
397 forcedescription: Force printing description.
398 forcefilename: Force printing final filename.
399 simulate: Do not download the video files.
400 format: Video format code.
401 format_limit: Highest quality format to try.
402 outtmpl: Template for output names.
403 ignoreerrors: Do not stop on download errors.
404 ratelimit: Download speed limit, in bytes/sec.
405 nooverwrites: Prevent overwriting files.
406 retries: Number of times to retry for HTTP error 5xx
407 continuedl: Try to continue downloads if possible.
408 noprogress: Do not print the progress bar.
409 playliststart: Playlist item to start at.
410 playlistend: Playlist item to end at.
411 logtostderr: Log messages to stderr instead of stdout.
412 consoletitle: Display progress in console window's titlebar.
413 nopart: Do not use temporary .part files.
414 updatetime: Use the Last-modified header to set output file timestamps.
415 writedescription: Write the video description to a .description file
421 _download_retcode = None
422 _num_downloads = None
425 def __init__(self, params):
426 """Create a FileDownloader object with the given options."""
429 self._download_retcode = 0
430 self._num_downloads = 0
431 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
435 def pmkdir(filename):
436 """Create directory components in filename. Similar to Unix "mkdir -p"."""
437 components = filename.split(os.sep)
438 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
439 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
440 for dir in aggregate:
441 if not os.path.exists(dir):
445 def format_bytes(bytes):
448 if type(bytes) is str:
453 exponent = long(math.log(bytes, 1024.0))
454 suffix = 'bkMGTPEZY'[exponent]
455 converted = float(bytes) / float(1024**exponent)
456 return '%.2f%s' % (converted, suffix)
459 def calc_percent(byte_counter, data_len):
462 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
465 def calc_eta(start, now, total, current):
469 if current == 0 or dif < 0.001: # One millisecond
471 rate = float(current) / dif
472 eta = long((float(total) - float(current)) / rate)
473 (eta_mins, eta_secs) = divmod(eta, 60)
476 return '%02d:%02d' % (eta_mins, eta_secs)
479 def calc_speed(start, now, bytes):
481 if bytes == 0 or dif < 0.001: # One millisecond
482 return '%10s' % '---b/s'
483 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
486 def best_block_size(elapsed_time, bytes):
487 new_min = max(bytes / 2.0, 1.0)
488 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
489 if elapsed_time < 0.001:
491 rate = bytes / elapsed_time
499 def parse_bytes(bytestr):
500 """Parse a string indicating a byte quantity into a long integer."""
501 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
504 number = float(matchobj.group(1))
505 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
506 return long(round(number * multiplier))
508 def add_info_extractor(self, ie):
509 """Add an InfoExtractor object to the end of the list."""
511 ie.set_downloader(self)
513 def add_post_processor(self, pp):
514 """Add a PostProcessor object to the end of the chain."""
516 pp.set_downloader(self)
518 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
519 """Print message to stdout if not in quiet mode."""
521 if not self.params.get('quiet', False):
522 terminator = [u'\n', u''][skip_eol]
523 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
524 self._screen_file.flush()
525 except (UnicodeEncodeError), err:
526 if not ignore_encoding_errors:
529 def to_stderr(self, message):
530 """Print message to stderr."""
531 print >>sys.stderr, message.encode(preferredencoding())
533 def to_cons_title(self, message):
534 """Set console/terminal window title to message."""
535 if not self.params.get('consoletitle', False):
537 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
538 # c_wchar_p() might not be necessary if `message` is
539 # already of type unicode()
540 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
541 elif 'TERM' in os.environ:
542 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
544 def fixed_template(self):
545 """Checks if the output template is fixed."""
546 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
548 def trouble(self, message=None):
549 """Determine action to take when a download problem appears.
551 Depending on if the downloader has been configured to ignore
552 download errors or not, this method may throw an exception or
553 not when errors are found, after printing the message.
555 if message is not None:
556 self.to_stderr(message)
557 if not self.params.get('ignoreerrors', False):
558 raise DownloadError(message)
559 self._download_retcode = 1
561 def slow_down(self, start_time, byte_counter):
562 """Sleep if the download speed is over the rate limit."""
563 rate_limit = self.params.get('ratelimit', None)
564 if rate_limit is None or byte_counter == 0:
567 elapsed = now - start_time
570 speed = float(byte_counter) / elapsed
571 if speed > rate_limit:
572 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
574 def temp_name(self, filename):
575 """Returns a temporary filename for the given filename."""
576 if self.params.get('nopart', False) or filename == u'-' or \
577 (os.path.exists(filename) and not os.path.isfile(filename)):
579 return filename + u'.part'
581 def undo_temp_name(self, filename):
582 if filename.endswith(u'.part'):
583 return filename[:-len(u'.part')]
586 def try_rename(self, old_filename, new_filename):
588 if old_filename == new_filename:
590 os.rename(old_filename, new_filename)
591 except (IOError, OSError), err:
592 self.trouble(u'ERROR: unable to rename file')
594 def try_utime(self, filename, last_modified_hdr):
595 """Try to set the last-modified time of the given file."""
596 if last_modified_hdr is None:
598 if not os.path.isfile(filename):
600 timestr = last_modified_hdr
603 filetime = timeconvert(timestr)
607 os.utime(filename,(time.time(), filetime))
611 def report_writedescription(self, descfn):
612 """ Report that the description file has been written """
613 self.to_screen(u'[info] Video description written to: %s' % descfn, ignore_encoding_errors=True)
615 def report_destination(self, filename):
616 """Report destination filename."""
617 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
619 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
620 """Report download progress."""
621 if self.params.get('noprogress', False):
623 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
624 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
625 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
626 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
628 def report_resuming_byte(self, resume_len):
629 """Report attempt to resume at given byte."""
630 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
632 def report_retry(self, count, retries):
633 """Report retry in case of HTTP error 5xx"""
634 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
636 def report_file_already_downloaded(self, file_name):
637 """Report file has already been fully downloaded."""
639 self.to_screen(u'[download] %s has already been downloaded' % file_name)
640 except (UnicodeEncodeError), err:
641 self.to_screen(u'[download] The file has already been downloaded')
643 def report_unable_to_resume(self):
644 """Report it was impossible to resume download."""
645 self.to_screen(u'[download] Unable to resume')
647 def report_finish(self):
648 """Report download finished."""
649 if self.params.get('noprogress', False):
650 self.to_screen(u'[download] Download completed')
654 def increment_downloads(self):
655 """Increment the ordinal that assigns a number to each file."""
656 self._num_downloads += 1
658 def prepare_filename(self, info_dict):
659 """Generate the output filename."""
661 template_dict = dict(info_dict)
662 template_dict['epoch'] = unicode(long(time.time()))
663 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
664 filename = self.params['outtmpl'] % template_dict
666 except (ValueError, KeyError), err:
667 self.trouble(u'ERROR: invalid system charset or erroneous output template')
670 def process_info(self, info_dict):
671 """Process a single dictionary returned by an InfoExtractor."""
672 filename = self.prepare_filename(info_dict)
673 # Do nothing else if in simulate mode
674 if self.params.get('simulate', False):
676 if self.params.get('forcetitle', False):
677 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
678 if self.params.get('forceurl', False):
679 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
680 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
681 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
682 if self.params.get('forcedescription', False) and 'description' in info_dict:
683 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
684 if self.params.get('forcefilename', False) and filename is not None:
685 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
691 if self.params.get('nooverwrites', False) and os.path.exists(filename):
692 self.to_stderr(u'WARNING: file exists and will be skipped')
696 self.pmkdir(filename)
697 except (OSError, IOError), err:
698 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
701 if self.params.get('writedescription', False):
703 descfn = filename + '.description'
704 with contextlib.closing(open(descfn, 'wb')) as descfile:
705 descfile.write(info_dict['description'].encode('utf-8'))
706 self.report_writedescription(descfn)
707 except (OSError, IOError):
708 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
712 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
713 except (OSError, IOError), err:
714 raise UnavailableVideoError
715 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
716 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
718 except (ContentTooShortError, ), err:
719 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
724 self.post_process(filename, info_dict)
725 except (PostProcessingError), err:
726 self.trouble(u'ERROR: postprocessing: %s' % str(err))
729 def download(self, url_list):
730 """Download a given list of URLs."""
731 if len(url_list) > 1 and self.fixed_template():
732 raise SameFileError(self.params['outtmpl'])
735 suitable_found = False
737 # Go to next InfoExtractor if not suitable
738 if not ie.suitable(url):
741 # Suitable InfoExtractor found
742 suitable_found = True
744 # Extract information from URL and process it
747 # Suitable InfoExtractor had been found; go to next URL
750 if not suitable_found:
751 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
753 return self._download_retcode
755 def post_process(self, filename, ie_info):
756 """Run the postprocessing chain on the given file."""
758 info['filepath'] = filename
764 def _download_with_rtmpdump(self, filename, url, player_url):
765 self.report_destination(filename)
766 tmpfilename = self.temp_name(filename)
768 # Check for rtmpdump first
770 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
771 except (OSError, IOError):
772 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
775 # Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
777 # possible. This is part of rtmpdump's normal usage, AFAIK.
778 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
779 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
780 while retval == 2 or retval == 1:
781 prevsize = os.path.getsize(tmpfilename)
782 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
783 time.sleep(5.0) # This seems to be needed
784 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
785 cursize = os.path.getsize(tmpfilename)
786 if prevsize == cursize and retval == 1:
789 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
790 self.try_rename(tmpfilename, filename)
793 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
796 def _do_download(self, filename, url, player_url):
797 # Check file already present
798 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
799 self.report_file_already_downloaded(filename)
802 # Attempt to download using rtmpdump
803 if url.startswith('rtmp'):
804 return self._download_with_rtmpdump(filename, url, player_url)
806 tmpfilename = self.temp_name(filename)
810 # Do not include the Accept-Encoding header
811 headers = {'Youtubedl-no-compression': 'True'}
812 basic_request = urllib2.Request(url, None, headers)
813 request = urllib2.Request(url, None, headers)
815 # Establish possible resume length
816 if os.path.isfile(tmpfilename):
817 resume_len = os.path.getsize(tmpfilename)
821 # Request parameters in case of being able to resume
822 if self.params.get('continuedl', False) and resume_len != 0:
823 self.report_resuming_byte(resume_len)
824 request.add_header('Range','bytes=%d-' % resume_len)
828 retries = self.params.get('retries', 0)
829 while count <= retries:
830 # Establish connection
832 data = urllib2.urlopen(request)
834 except (urllib2.HTTPError, ), err:
835 if (err.code < 500 or err.code >= 600) and err.code != 416:
836 # Unexpected HTTP error
838 elif err.code == 416:
839 # Unable to resume (requested range not satisfiable)
841 # Open the connection again without the range header
842 data = urllib2.urlopen(basic_request)
843 content_length = data.info()['Content-Length']
844 except (urllib2.HTTPError, ), err:
845 if err.code < 500 or err.code >= 600:
848 # Examine the reported length
849 if (content_length is not None and
850 (resume_len - 100 < long(content_length) < resume_len + 100)):
851 # The file had already been fully downloaded.
852 # Explanation to the above condition: in issue #175 it was revealed that
853 # YouTube sometimes adds or removes a few bytes from the end of the file,
854 # changing the file size slightly and causing problems for some users. So
855 # I decided to implement a suggested change and consider the file
856 # completely downloaded if the file size differs less than 100 bytes from
857 # the one in the hard drive.
858 self.report_file_already_downloaded(filename)
859 self.try_rename(tmpfilename, filename)
862 # The length does not match, we start the download over
863 self.report_unable_to_resume()
869 self.report_retry(count, retries)
872 self.trouble(u'ERROR: giving up after %s retries' % retries)
875 data_len = data.info().get('Content-length', None)
876 if data_len is not None:
877 data_len = long(data_len) + resume_len
878 data_len_str = self.format_bytes(data_len)
879 byte_counter = 0 + resume_len
885 data_block = data.read(block_size)
887 if len(data_block) == 0:
889 byte_counter += len(data_block)
891 # Open file just in time
894 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
895 filename = self.undo_temp_name(tmpfilename)
896 self.report_destination(filename)
897 except (OSError, IOError), err:
898 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
901 stream.write(data_block)
902 except (IOError, OSError), err:
903 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
905 block_size = self.best_block_size(after - before, len(data_block))
908 percent_str = self.calc_percent(byte_counter, data_len)
909 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
910 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
911 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
914 self.slow_down(start, byte_counter - resume_len)
918 if data_len is not None and byte_counter != data_len:
919 raise ContentTooShortError(byte_counter, long(data_len))
920 self.try_rename(tmpfilename, filename)
922 # Update file modification time
923 if self.params.get('updatetime', True):
924 self.try_utime(filename, data.info().get('last-modified', None))
928 class InfoExtractor(object):
929 """Information Extractor class.
931 Information extractors are the classes that, given a URL, extract
932 information from the video (or videos) the URL refers to. This
933 information includes the real video URL, the video title and simplified
934 title, author and others. The information is stored in a dictionary
935 which is then passed to the FileDownloader. The FileDownloader
936 processes this information possibly downloading the video to the file
937 system, among other possible outcomes. The dictionaries must include
938 the following fields:
940 id: Video identifier.
941 url: Final video URL.
942 uploader: Nickname of the video uploader.
943 title: Literal title.
944 stitle: Simplified title.
945 ext: Video filename extension.
946 format: Video format.
947 player_url: SWF Player URL (may be None).
949 The following fields are optional. Their primary purpose is to allow
950 youtube-dl to serve as the backend for a video search function, such
951 as the one in youtube2mp3. They are only used when their respective
952 forced printing functions are called:
954 thumbnail: Full URL to a video thumbnail image.
955 description: One-line video description.
957 Subclasses of this one should re-define the _real_initialize() and
958 _real_extract() methods, as well as the suitable() static method.
959 Probably, they should also be instantiated and added to the main
966 def __init__(self, downloader=None):
967 """Constructor. Receives an optional downloader."""
969 self.set_downloader(downloader)
973 """Receives a URL and returns True if suitable for this IE."""
976 def initialize(self):
977 """Initializes an instance (authentication, etc)."""
979 self._real_initialize()
982 def extract(self, url):
983 """Extracts URL information and returns it in list of dicts."""
985 return self._real_extract(url)
987 def set_downloader(self, downloader):
988 """Sets the downloader for this IE."""
989 self._downloader = downloader
991 def _real_initialize(self):
992 """Real initialization process. Redefine in subclasses."""
995 def _real_extract(self, url):
996 """Real extraction process. Redefine in subclasses."""
999 class YoutubeIE(InfoExtractor):
1000 """Information extractor for youtube.com."""
1002 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1003 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1004 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1005 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1006 _NETRC_MACHINE = 'youtube'
1007 # Listed in order of quality
1008 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1009 _video_extensions = {
1015 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1022 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1024 def report_lang(self):
1025 """Report attempt to set language."""
1026 self._downloader.to_screen(u'[youtube] Setting language')
1028 def report_login(self):
1029 """Report attempt to log in."""
1030 self._downloader.to_screen(u'[youtube] Logging in')
1032 def report_age_confirmation(self):
1033 """Report attempt to confirm age."""
1034 self._downloader.to_screen(u'[youtube] Confirming age')
1036 def report_video_webpage_download(self, video_id):
1037 """Report attempt to download video webpage."""
1038 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1040 def report_video_info_webpage_download(self, video_id):
1041 """Report attempt to download video info webpage."""
1042 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1044 def report_information_extraction(self, video_id):
1045 """Report attempt to extract video information."""
1046 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1048 def report_unavailable_format(self, video_id, format):
1049 """Report extracted video URL."""
1050 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1052 def report_rtmp_download(self):
1053 """Indicate the download will use the RTMP protocol."""
1054 self._downloader.to_screen(u'[youtube] RTMP download detected')
def _real_initialize(self):
	"""Prepare the YouTube session: set language, log in, confirm age.

	Credentials come from downloader params ('username'/'password') or,
	with 'usenetrc', from the user's .netrc. Login failures are warnings;
	a failed age confirmation is reported as an error.

	NOTE(review): this listing elides several source lines (the internal
	numbering jumps) — early returns, `try:` lines and dict openings.
	Layout below is reconstructed; confirm against the full file.
	"""
	if self._downloader is None:
	# (elided: early return — nothing to do without a downloader)

	downloader_params = self._downloader.params

	# Attempt to use provided username and password or .netrc data
	if downloader_params.get('username', None) is not None:
		username = downloader_params['username']
		password = downloader_params['password']
	elif downloader_params.get('usenetrc', False):
		# (elided: try:)
		info = netrc.netrc().authenticators(self._NETRC_MACHINE)
		if info is not None:
			# (elided: credential unpacking; the raise below presumably
			# belongs to the matching else branch — confirm)
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
		except (IOError, netrc.NetrcParseError), err:
			# .netrc problems are non-fatal: warn and continue unauthenticated
			self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

	# Force English pages so later regexes match (_LANG_URL sets the cookie)
	request = urllib2.Request(self._LANG_URL)
	# (elided: try:)
	urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

	# No authentication to be performed
	if username is None:
	# (elided: return)

	# (elided: login_form = { ... opening of the login form dict)
		'current_form': 'loginForm',
		'action_login': 'Log In',
		'username': username,
		'password': password,
	request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
	# (elided: try: / login progress report)
	login_results = urllib2.urlopen(request).read()
	# If the login form is still present, the credentials were rejected
	if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
		self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	# (elided: age_form = { ... opening of the age-confirmation dict)
		'action_confirm': 'Confirm',
	request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
	# (elided: try:)
	self.report_age_confirmation()
	age_results = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		# Fatal: age-restricted videos cannot be fetched without this
		self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
	"""Download metadata for a YouTube watch URL and hand each selected
	format to the downloader via process_info.

	NOTE(review): this listing elides many source lines (numbering jumps):
	`try:` lines, `return`s and `if mobj is None:` guards. Layout is
	reconstructed; confirm against the full file.
	"""
	# Extract video id from URL
	mobj = re.match(self._VALID_URL, url)
	# (elided: if mobj is None: guard)
	self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
	# group(2) of _VALID_URL holds the id proper
	video_id = mobj.group(2)

	self.report_video_webpage_download(video_id)
	# gl/hl pin the US English page; has_verified skips some interstitials
	request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
	# (elided: try:)
	video_webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

	# Attempt to extract SWF player URL
	mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
	if mobj is not None:
		# Undo the JavaScript backslash-escaping in the embedded URL
		player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
	# (elided: else branch — presumably player_url = None)

	# Query get_video_info, trying several &el= variants until one
	# response contains a 'token' parameter
	self.report_video_info_webpage_download(video_id)
	for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
		video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
				% (video_id, el_type))
		request = urllib2.Request(video_info_url)
		# (elided: try:)
		video_info_webpage = urllib2.urlopen(request).read()
		video_info = parse_qs(video_info_webpage)
		if 'token' in video_info:
		# (elided: break)
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
	if 'token' not in video_info:
		if 'reason' in video_info:
			# Surface YouTube's own error message when one is given
			self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
		# (elided: else:)
		self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

	# Start extracting information
	self.report_information_extraction(video_id)

	# uploader
	if 'author' not in video_info:
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
	video_uploader = urllib.unquote_plus(video_info['author'][0])

	# title
	if 'title' not in video_info:
		self._downloader.trouble(u'ERROR: unable to extract video title')
	video_title = urllib.unquote_plus(video_info['title'][0])
	video_title = video_title.decode('utf-8')
	video_title = sanitize_title(video_title)

	# simplified title: collapse runs of non-alphanumerics to '_'
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
	simple_title = simple_title.strip(ur'_')

	# thumbnail image
	if 'thumbnail_url' not in video_info:
		self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
		video_thumbnail = ''
	else:	# don't panic if we can't find it
		video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

	# upload date: scraped from the watch page and normalised to YYYYMMDD
	mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
	if mobj is not None:
		upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
		format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
		for expression in format_expressions:
			# (elided: try:)
			upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
			# (elided: except ValueError: pass — try the next expression)

	# video description — only fetched when the user asked for it
	video_description = u'No description available.'
	if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
		# (elided: lxml availability check; this warning is the fallback path)
		warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
		# (elided: else: lxml-based extraction below)
		html_parser = lxml.etree.HTMLParser(encoding='utf-8')
		vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
		video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))

	# token
	video_token = urllib.unquote_plus(video_info['token'][0])

	# Decide which formats to download
	req_format = self._downloader.params.get('format', None)

	if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
		# fmt_url_map is a comma-separated list of 'itag|url' pairs
		url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
		format_limit = self._downloader.params.get('format_limit', None)
		if format_limit is not None and format_limit in self._available_formats:
			# Only consider formats at or below the requested quality cap
			format_list = self._available_formats[self._available_formats.index(format_limit):]
		# (elided: else:)
		format_list = self._available_formats
		existing_formats = [x for x in format_list if x in url_map]
		if len(existing_formats) == 0:
			self._downloader.trouble(u'ERROR: no known formats available for video')
		if req_format is None:
			video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
		elif req_format == '-1':
			video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
		# (elided: else:)
		if req_format not in url_map:
			self._downloader.trouble(u'ERROR: requested format not available')
		video_url_list = [(req_format, url_map[req_format])] # Specific format
	elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
		self.report_rtmp_download()
		video_url_list = [(None, video_info['conn'][0])]
	# (elided: else:)
	self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

	for format_param, video_real_url in video_url_list:
		# At this point we have a new video
		self._downloader.increment_downloads()

		# Extension (defaults to flv when the itag is unknown)
		video_extension = self._video_extensions.get(format_param, 'flv')

		# Find the video URL in fmt_url_map or conn parameters

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_real_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': upload_date,
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
			# NOTE(review): `cond and a or b` idiom — works only because
			# u'NA' is truthy; a conditional expression would be clearer.
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': video_thumbnail.decode('utf-8'),
			'description': video_description,
			'player_url': player_url,
		# (elided: closing brace of the info dict)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	NOTE(review): this listing elides source lines (numbering jumps):
	`try:` lines, `if mobj is None:` guards and `return`s. Layout is
	reconstructed; confirm against the full file.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# POST target used to disable the family filter for this session
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# 'yt-' prefixed metacafe ids are delegated to this YouTube extractor
		self._youtube_ie = youtube_ie

	# (elided: suitable(url) method header)
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		# (elided: try:)
		self.report_disclaimer()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# Confirm age by POSTing the family-filter form
		# (elided: disclaimer_form = { ... opening lines)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		# (elided: try:)
		self.report_age_confirmation()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate to the YouTube extractor and stop here
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			# (elided: return)

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message reads "unable retrieve" (missing "to")
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			# (elided: if mobj is None:)
			video_url = mediaURL
			# (elided: else:)
			gdaKey = mobj.group(1)
			video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		# (elided: else: fall back to the flashvars blob below)
		mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract media URL')

		# flashvars is itself a query string; mediaData holds a JSON blob
		vardict = parse_qs(mobj.group(1))
		if 'mediaData' not in vardict:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = mobj.group(1).replace('\\/', '/')
		video_extension = mediaURL[-3:]
		video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (elided: suitable(url) method header)
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (elided: return — no initialization needed)

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message reads "unable retrieve" (missing "to")
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Media URL lives in an addVariable("video", ...) flash call
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (elided: suitable(url) method header)
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (elided: return — no initialization needed)

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the mp4 download link; fall back to the flv stream URL
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		# (elided: if mobj is None: — the flv fallback branch follows)
		video_extension = 'flv'
		mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo hex-escaping of '=' and '&' in the embedded URL
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# Thumbnail only appears on the search page, so search for the id
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			# (elided: try:)
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			# (elided: if mobj is None:)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			# (elided: 'uploader' entry)
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (elided: suitable(url) method header)
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (elided: return — no initialization needed)

	def _real_extract(self, url):
		# Extract id from URL (the 'current=' flv filename)
		mobj = re.match(self._VALID_URL, url)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title and uploader share one <title> pattern
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader,
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (elided: suitable(url) method header)
		return (re.match(YahooIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (elided: return — no initialization needed)

	def _real_extract(self, url, new_video=True):
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = urllib2.Request(url)
			# (elided: try:)
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			# (elided: if mobj is None:)
			self._downloader.trouble(u'ERROR: Unable to extract id field')
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			# (elided: if mobj is None:)
			self._downloader.trouble(u'ERROR: Unable to extract vid field')
			yahoo_vid = mobj.group(1)

			# Recurse exactly once with the canonical /watch/ URL
			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video uploader')
		# NOTE(review): group(1) captures the '(people|profile)' path segment;
		# the uploader display name is group(2) — this looks like a bug.
		video_uploader = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description: video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video height')
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract video width')
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
		yv_bitrate = '700' # according to Wikipedia this is hard-coded
		request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: Unable to extract media URL')
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		# Decode HTML entities (&amp; etc.) left in the URL
		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			# (elided: 'url' entry)
			'uploader': video_uploader,
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
			'thumbnail': video_thumbnail.decode('utf-8'),
			'description': video_description,
			# NOTE(review): duplicate keys — these two entries override the
			# pair above ('thumbnail' loses its .decode('utf-8')); likely
			# leftover from an edit.
			'thumbnail': video_thumbnail,
			'description': video_description,
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (elided: suitable(url) method and its body)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# Make explicit to the user that this extractor only guesses
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (elided: return — no initialization needed)

	def _real_extract(self, url):
		# At this point we have a new video
		self._downloader.increment_downloads()

		# Provisional id: last path component of the URL
		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		# (elided: try:)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		# (elided: if mobj is None:)
		# Broaden the search a little bit
		mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_url = urllib.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		# (elided: if mobj is None:)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		# (elided: if mobj is None:)
		# NOTE(review): this branch failed to extract the uploader (domain),
		# yet the message says "title" — misleading error text.
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_uploader = mobj.group(1).decode('utf-8')

		# Process video information
		# (elided: try:)
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader,
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# (elided: remaining entries and closing brace)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs and
	delegates each found video to the wrapped YouTube extractor.

	NOTE(review): this listing elides source lines (numbering jumps);
	layout is reconstructed — confirm against the full file.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on results per query (matches the site's own limit)
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	# (elided: suitable(url) method header)
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_QUERY, query)
		# (elided: if mobj is None: guard)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		# prefix is '', 'all' or a decimal count
		prefix, query = query.split(':')
		# (elided: prefix normalisation)
		query = query.encode('utf-8')
		# (elided: if prefix == '':)
		self._download_n_results(query, 1)
		# (elided: return)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
		# (elided: else: try: n = parsed count; if n <= 0:)
			self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		# (elided: return)
		elif n > self._max_youtube_results:
			# Clamp over-large requests to the site maximum, with a warning
			self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
			n = self._max_youtube_results
		self._download_n_results(query, n)
		# (elided: return)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)
		# (elided: return)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		# (elided: video_ids = [] and pagination setup)
		already_seen = set()
		# (elided: pagenum init and page loop header)
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		# (elided: try:)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# href="/watch?v=ID" — split on '=' and drop the trailing quote
			video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					for id in video_ids:
						self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
					# (elided: return)

		# No "Next" link means this was the last results page
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
			# (elided: return)

		pagenum = pagenum + 1
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Mirrors YoutubeSearchIE, but scrapes Google Video search result pages
# and delegates extraction to a wrapped GoogleIE instance.
2064 class GoogleSearchIE(InfoExtractor):
2065 """Information Extractor for Google Video search queries."""
2066 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2067 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2068 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2069 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
# Hard cap on results fetched for an "all" query.
2071 _max_google_results = 1000
2073 def __init__(self, google_ie, downloader=None):
2074 InfoExtractor.__init__(self, downloader)
2075 self._google_ie = google_ie
2079 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2081 def report_download_page(self, query, pagenum):
2082 """Report attempt to download playlist page with given number."""
2083 query = query.decode(preferredencoding())
2084 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2086 def _real_initialize(self):
2087 self._google_ie.initialize()
# Parse "gvsearchN:terms"; bare prefix -> 1 result, "all" -> the cap.
2089 def _real_extract(self, query):
2090 mobj = re.match(self._VALID_QUERY, query)
2092 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2095 prefix, query = query.split(':')
2097 query = query.encode('utf-8')
2099 self._download_n_results(query, 1)
2101 elif prefix == 'all':
2102 self._download_n_results(query, self._max_google_results)
2108 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2110 elif n > self._max_google_results:
2111 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2112 n = self._max_google_results
2113 self._download_n_results(query, n)
2115 except ValueError: # parsing prefix as integer fails
2116 self._download_n_results(query, 1)
2119 def _download_n_results(self, query, n):
2120 """Downloads a specified number of results for a query"""
2123 already_seen = set()
2127 self.report_download_page(query, pagenum)
2128 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2129 request = urllib2.Request(result_url)
2131 page = urllib2.urlopen(request).read()
2132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2136 # Extract video identifiers
2137 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike the YouTube variant, the docid is a clean capture group.
2138 video_id = mobj.group(1)
2139 if video_id not in already_seen:
2140 video_ids.append(video_id)
2141 already_seen.add(video_id)
2142 if len(video_ids) == n:
2143 # Specified n videos reached
2144 for id in video_ids:
2145 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# Last page (no Next link): flush collected ids.
2148 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2149 for id in video_ids:
2150 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2153 pagenum = pagenum + 1
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Same search-IE pattern for Yahoo! Video; delegates to a wrapped YahooIE.
2155 class YahooSearchIE(InfoExtractor):
2156 """Information Extractor for Yahoo! Video search queries."""
2157 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2158 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2159 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2160 _MORE_PAGES_INDICATOR = r'\s*Next'
2162 _max_yahoo_results = 1000
2164 def __init__(self, yahoo_ie, downloader=None):
2165 InfoExtractor.__init__(self, downloader)
2166 self._yahoo_ie = yahoo_ie
2170 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2172 def report_download_page(self, query, pagenum):
2173 """Report attempt to download playlist page with given number."""
2174 query = query.decode(preferredencoding())
2175 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2177 def _real_initialize(self):
2178 self._yahoo_ie.initialize()
# Parse "yvsearchN:terms"; bare prefix -> 1 result, "all" -> the cap.
2180 def _real_extract(self, query):
2181 mobj = re.match(self._VALID_QUERY, query)
2183 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2186 prefix, query = query.split(':')
2188 query = query.encode('utf-8')
2190 self._download_n_results(query, 1)
2192 elif prefix == 'all':
2193 self._download_n_results(query, self._max_yahoo_results)
2199 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2201 elif n > self._max_yahoo_results:
2202 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2203 n = self._max_yahoo_results
2204 self._download_n_results(query, n)
2206 except ValueError: # parsing prefix as integer fails
2207 self._download_n_results(query, 1)
2210 def _download_n_results(self, query, n):
2211 """Downloads a specified number of results for a query"""
2214 already_seen = set()
2218 self.report_download_page(query, pagenum)
2219 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2220 request = urllib2.Request(result_url)
2222 page = urllib2.urlopen(request).read()
2223 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2224 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2227 # Extract video identifiers
2228 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2229 video_id = mobj.group(1)
2230 if video_id not in already_seen:
2231 video_ids.append(video_id)
2232 already_seen.add(video_id)
2233 if len(video_ids) == n:
2234 # Specified n videos reached
2235 for id in video_ids:
2236 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# Last page (no Next link): flush collected ids.
2239 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2240 for id in video_ids:
2241 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2244 pagenum = pagenum + 1
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Walks YouTube playlist/artist/user-grid pages, collects watch ids, then
# hands each one to the wrapped YoutubeIE.
2246 class YoutubePlaylistIE(InfoExtractor):
2247 """Information Extractor for YouTube playlists."""
2249 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2250 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2251 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2252 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2255 def __init__(self, youtube_ie, downloader=None):
2256 InfoExtractor.__init__(self, downloader)
2257 self._youtube_ie = youtube_ie
2261 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2263 def report_download_page(self, playlist_id, pagenum):
2264 """Report attempt to download playlist page with given number."""
2265 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2267 def _real_initialize(self):
2268 self._youtube_ie.initialize()
2270 def _real_extract(self, url):
2271 # Extract playlist id
2272 mobj = re.match(self._VALID_URL, url)
2274 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 is a single-video id embedded in the playlist URL; if present,
# extract just that video instead of the whole playlist.
2278 if mobj.group(3) is not None:
2279 self._youtube_ie.extract(mobj.group(3))
2282 # Download playlist pages
2283 # prefix is 'p' as default for playlists but there are other types that need extra care
2284 playlist_prefix = mobj.group(1)
2285 if playlist_prefix == 'a':
2286 playlist_access = 'artist'
2288 playlist_prefix = 'p'
2289 playlist_access = 'view_play_list'
2290 playlist_id = mobj.group(2)
2295 self.report_download_page(playlist_id, pagenum)
2296 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2298 page = urllib2.urlopen(request).read()
2299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2303 # Extract video identifiers
# Dedup within a page via ids_in_page, then append to the overall list.
2305 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2306 if mobj.group(1) not in ids_in_page:
2307 ids_in_page.append(mobj.group(1))
2308 video_ids.extend(ids_in_page)
2310 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2312 pagenum = pagenum + 1
# Honor --playlist-start/--playlist-end (1-based start converted to
# 0-based slice index; end defaults to -1, i.e. last).
2314 playliststart = self._downloader.params.get('playliststart', 1) - 1
2315 playlistend = self._downloader.params.get('playlistend', -1)
2316 video_ids = video_ids[playliststart:playlistend]
2318 for id in video_ids:
2319 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Enumerates a user's uploads via the GData API, paging by
# _GDATA_PAGE_SIZE, then extracts each video via the wrapped YoutubeIE.
2322 class YoutubeUserIE(InfoExtractor):
2323 """Information Extractor for YouTube users."""
2325 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2326 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2327 _GDATA_PAGE_SIZE = 50
2328 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2329 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2332 def __init__(self, youtube_ie, downloader=None):
2333 InfoExtractor.__init__(self, downloader)
2334 self._youtube_ie = youtube_ie
2338 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2340 def report_download_page(self, username, start_index):
2341 """Report attempt to download user page."""
2342 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2343 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2345 def _real_initialize(self):
2346 self._youtube_ie.initialize()
2348 def _real_extract(self, url):
2350 mobj = re.match(self._VALID_URL, url)
2352 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2355 username = mobj.group(1)
2357 # Download video ids using YouTube Data API. Result size per
2358 # query is limited (currently to 50 videos) so we need to query
2359 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2366 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2367 self.report_download_page(username, start_index)
2369 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2372 page = urllib2.urlopen(request).read()
2373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2374 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2377 # Extract video identifiers
2380 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2381 if mobj.group(1) not in ids_in_page:
2382 ids_in_page.append(mobj.group(1))
2384 video_ids.extend(ids_in_page)
2386 # A little optimization - if current page is not
2387 # "full", ie. does not contain PAGE_SIZE video ids then
2388 # we can assume that this page is the last one - there
2389 # are no more ids on further pages - no need to query
2392 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2397 all_ids_count = len(video_ids)
2398 playliststart = self._downloader.params.get('playliststart', 1) - 1
2399 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend == -1 means "to the end"; a plain [start:-1] slice would
# wrongly drop the last video, hence the explicit branch.
2401 if playlistend == -1:
2402 video_ids = video_ids[playliststart:]
2404 video_ids = video_ids[playliststart:playlistend]
2406 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2407 (username, all_ids_count, len(video_ids)))
2409 for video_id in video_ids:
2410 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Scrapes a depositfiles.com file page (with the 'Free download' form
# submitted) for the direct fileshare URL and title.
2413 class DepositFilesIE(InfoExtractor):
2414 """Information extractor for depositfiles.com"""
2416 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2418 def __init__(self, downloader=None):
2419 InfoExtractor.__init__(self, downloader)
2423 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2425 def report_download_webpage(self, file_id):
2426 """Report webpage download."""
2427 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2429 def report_extraction(self, file_id):
2430 """Report information extraction."""
2431 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2433 def _real_initialize(self):
2436 def _real_extract(self, url):
2437 # At this point we have a new file
2438 self._downloader.increment_downloads()
2440 file_id = url.split('/')[-1]
2441 # Rebuild url in english locale
2442 url = 'http://depositfiles.com/en/files/' + file_id
2444 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the button.
2445 free_download_indication = { 'gateway_result' : '1' }
2446 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2448 self.report_download_webpage(file_id)
2449 webpage = urllib2.urlopen(request).read()
2450 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2451 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2454 # Search for the real file URL
2455 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2456 if (mobj is None) or (mobj.group(1) is None):
2457 # Try to figure out reason of the error.
2458 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2459 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string r'\s+' by convention; works
# here because \s is not a recognized string escape in Python 2.
2460 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2461 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2463 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2466 file_url = mobj.group(1)
2467 file_extension = os.path.splitext(file_url)[1][1:]
2469 # Search for file title
2470 mobj = re.search(r'<b title="(.*?)">', webpage)
2472 self._downloader.trouble(u'ERROR: unable to extract title')
2474 file_title = mobj.group(1).decode('utf-8')
2477 # Process file information
2478 self._downloader.process_info({
2479 'id': file_id.decode('utf-8'),
2480 'url': file_url.decode('utf-8'),
2482 'upload_date': u'NA',
2483 'title': file_title,
2484 'stitle': file_title,
2485 'ext': file_extension.decode('utf-8'),
2489 except UnavailableVideoError, err:
2490 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Logs in to Facebook (credentials from options or .netrc), fetches the
# video page, and parses title/owner/date/thumbnail plus per-quality URLs
# out of embedded JavaScript.
2492 class FacebookIE(InfoExtractor):
2493 """Information Extractor for Facebook"""
2495 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2496 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2497 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used both for parsing and quality selection.
2498 _available_formats = ['highqual', 'lowqual']
2499 _video_extensions = {
2504 def __init__(self, downloader=None):
2505 InfoExtractor.__init__(self, downloader)
2509 return (re.match(FacebookIE._VALID_URL, url) is not None)
2511 def _reporter(self, message):
2512 """Add header and report message."""
2513 self._downloader.to_screen(u'[facebook] %s' % message)
2515 def report_login(self):
2516 """Report attempt to log in."""
2517 self._reporter(u'Logging in')
2519 def report_video_webpage_download(self, video_id):
2520 """Report attempt to download video webpage."""
2521 self._reporter(u'%s: Downloading video webpage' % video_id)
2523 def report_information_extraction(self, video_id):
2524 """Report attempt to extract video information."""
2525 self._reporter(u'%s: Extracting video information' % video_id)
2527 def _parse_page(self, video_webpage):
2528 """Extract video information from page"""
# Field name -> regex over the raw page; only matched fields are set.
2530 data = {'title': r'class="video_title datawrap">(.*?)</',
2531 'description': r'<div class="datawrap">(.*?)</div>',
2532 'owner': r'\("video_owner_name", "(.*?)"\)',
2533 'upload_date': r'data-date="(.*?)"',
2534 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2537 for piece in data.keys():
2538 mobj = re.search(data[piece], video_webpage)
2539 if mobj is not None:
2540 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2544 for fmt in self._available_formats:
2545 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2546 if mobj is not None:
2547 # URL is in a Javascript segment inside an escaped Unicode format within
2548 # the generally utf-8 page
2549 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2550 video_info['video_urls'] = video_urls
2554 def _real_initialize(self):
2555 if self._downloader is None:
2560 downloader_params = self._downloader.params
2562 # Attempt to use provided username and password or .netrc data
2563 if downloader_params.get('username', None) is not None:
2564 useremail = downloader_params['username']
2565 password = downloader_params['password']
2566 elif downloader_params.get('usenetrc', False):
2568 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2569 if info is not None:
2573 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2574 except (IOError, netrc.NetrcParseError), err:
2575 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials at all: presumably returns without logging in.
2578 if useremail is None:
2587 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2590 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2591 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2592 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2595 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2598 def _real_extract(self, url):
2599 mobj = re.match(self._VALID_URL, url)
2601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2603 video_id = mobj.group('ID')
2606 self.report_video_webpage_download(video_id)
2607 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2609 page = urllib2.urlopen(request)
2610 video_webpage = page.read()
2611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2615 # Start extracting information
2616 self.report_information_extraction(video_id)
2618 # Extract information
2619 video_info = self._parse_page(video_webpage)
2622 if 'owner' not in video_info:
2623 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2625 video_uploader = video_info['owner']
2628 if 'title' not in video_info:
2629 self._downloader.trouble(u'ERROR: unable to extract video title')
2631 video_title = video_info['title']
2632 video_title = video_title.decode('utf-8')
2633 video_title = sanitize_title(video_title)
# Build a filesystem-safe "simple" title from the allowed char set.
2636 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2637 simple_title = simple_title.strip(ur'_')
2640 if 'thumbnail' not in video_info:
2641 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2642 video_thumbnail = ''
2644 video_thumbnail = video_info['thumbnail']
2648 if 'upload_date' in video_info:
2649 upload_time = video_info['upload_date']
2650 timetuple = email.utils.parsedate_tz(upload_time)
2651 if timetuple is not None:
# NOTE(review): if the date is missing/unparseable, upload_date may be
# unbound/defaulted elsewhere — the fallback line is outside this excerpt.
2653 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2658 video_description = video_info.get('description', 'No description available.')
2660 url_map = video_info['video_urls']
2661 if len(url_map.keys()) > 0:
2662 # Decide which formats to download
2663 req_format = self._downloader.params.get('format', None)
2664 format_limit = self._downloader.params.get('format_limit', None)
2666 if format_limit is not None and format_limit in self._available_formats:
2667 format_list = self._available_formats[self._available_formats.index(format_limit):]
2669 format_list = self._available_formats
2670 existing_formats = [x for x in format_list if x in url_map]
2671 if len(existing_formats) == 0:
2672 self._downloader.trouble(u'ERROR: no known formats available for video')
2674 if req_format is None:
2675 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2676 elif req_format == '-1':
2677 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2680 if req_format not in url_map:
2681 self._downloader.trouble(u'ERROR: requested format not available')
2683 video_url_list = [(req_format, url_map[req_format])] # Specific format
2685 for format_param, video_real_url in video_url_list:
2687 # At this point we have a new video
2688 self._downloader.increment_downloads()
2691 video_extension = self._video_extensions.get(format_param, 'mp4')
2693 # Find the video URL in fmt_url_map or conn paramters
2695 # Process video information
2696 self._downloader.process_info({
2697 'id': video_id.decode('utf-8'),
2698 'url': video_real_url.decode('utf-8'),
2699 'uploader': video_uploader.decode('utf-8'),
2700 'upload_date': upload_date,
2701 'title': video_title,
2702 'stitle': simple_title,
2703 'ext': video_extension.decode('utf-8'),
2704 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2705 'thumbnail': video_thumbnail.decode('utf-8'),
2706 'description': video_description.decode('utf-8'),
2709 except UnavailableVideoError, err:
2710 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Fetches blip.tv's JSON metadata variant of a page URL and builds the
# info dict straight from the parsed fields.
2712 class BlipTVIE(InfoExtractor):
2713 """Information extractor for blip.tv"""
2715 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
# Captures the filename extension of the media URL.
2716 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2720 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2722 def report_extraction(self, file_id):
2723 """Report information extraction."""
2724 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2726 def _simplify_title(self, title):
2727 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2728 res = res.strip(ur'_')
2731 def _real_extract(self, url):
2732 mobj = re.match(self._VALID_URL, url)
2734 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Append the JSON skin parameters, using '&' when the URL already has a
# query string.
2737 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2738 request = urllib2.Request(json_url)
2739 self.report_extraction(mobj.group(1))
2741 json_code = urllib2.urlopen(request).read()
2742 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2743 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2746 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key.
2747 data = json_data['Post'] if 'Post' in json_data else json_data
2749 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2750 video_url = data['media']['url']
2751 umobj = re.match(self._URL_EXT, video_url)
2753 raise ValueError('Can not determine filename extension')
2754 ext = umobj.group(1)
2756 self._downloader.increment_downloads()
2759 'id': data['item_id'],
2761 'uploader': data['display_name'],
2762 'upload_date': upload_date,
2763 'title': data['title'],
2764 'stitle': self._simplify_title(data['title']),
2766 'format': data['media']['mimeType'],
2767 'thumbnail': data['thumbnailUrl'],
2768 'description': data['description'],
2769 'player_url': data['embedUrl']
2771 except (ValueError,KeyError), err:
2772 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2776 self._downloader.process_info(info)
2777 except UnavailableVideoError, err:
2778 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Base class for the downloader's post-processing chain.
2781 class PostProcessor(object):
2782 """Post Processor class.
2784 PostProcessor objects can be added to downloaders with their
2785 add_post_processor() method. When the downloader has finished a
2786 successful download, it will take its internal chain of PostProcessors
2787 and start calling the run() method on each one of them, first with
2788 an initial argument and then with the returned value of the previous
2791 The chain will be stopped if one of them ever returns None or the end
2792 of the chain is reached.
2794 PostProcessor objects follow a "mutual registration" process similar
2795 to InfoExtractor objects.
2800 def __init__(self, downloader=None):
2801 self._downloader = downloader
2803 def set_downloader(self, downloader):
2804 """Sets the downloader for this PP."""
2805 self._downloader = downloader
2807 def run(self, information):
2808 """Run the PostProcessor.
2810 The "information" argument is a dictionary like the ones
2811 composed by InfoExtractors. The only difference is that this
2812 one has an extra field called "filepath" that points to the
2815 When this method returns None, the postprocessing chain is
2816 stopped. However, this method may return an information
2817 dictionary that will be passed to the next postprocessing
2818 object in the chain. It can be the one it received after
2819 changing some fields.
2821 In addition, this method may raise a PostProcessingError
2822 exception that will be taken into account by the downloader
# Default implementation is a no-op pass-through.
2825 return information # by default, do nothing
# NOTE(review): sampled excerpt — interior lines missing; code unchanged.
# Post-processor that extracts the audio track from a downloaded video
# using ffprobe (codec detection) and ffmpeg (extraction/transcode).
2827 class FFmpegExtractAudioPP(PostProcessor):
2829 def __init__(self, downloader=None, preferredcodec=None):
2830 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec losslessly when it is aac/mp3.
2831 if preferredcodec is None:
2832 preferredcodec = 'best'
2833 self._preferredcodec = preferredcodec
2836 def get_audio_codec(path):
2838 cmd = ['ffprobe', '-show_streams', '--', path]
2839 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2840 output = handle.communicate()[0]
2841 if handle.wait() != 0:
2843 except (IOError, OSError):
# Scan ffprobe's stream dump: remember the last codec_name seen, and
# accept it once a codec_type=audio line confirms it is the audio stream.
2846 for line in output.split('\n'):
2847 if line.startswith('codec_name='):
2848 audio_codec = line.split('=')[1].strip()
2849 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2854 def run_ffmpeg(path, out_path, codec, more_opts):
2856 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2857 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2859 except (IOError, OSError):
2862 def run(self, information):
2863 path = information['filepath']
2865 filecodec = self.get_audio_codec(path)
2866 if filecodec is None:
2867 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2871 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2872 if filecodec == 'aac' or filecodec == 'mp3':
2873 # Lossless if possible
2875 extension = filecodec
# Raw AAC needs the ADTS container to be playable standalone.
2876 if filecodec == 'aac':
2877 more_opts = ['-f', 'adts']
2880 acodec = 'libmp3lame'
2882 more_opts = ['-ab', '128k']
2884 # We convert the audio (lossy)
2885 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2886 extension = self._preferredcodec
2887 more_opts = ['-ab', '128k']
2888 if self._preferredcodec == 'aac':
2889 more_opts += ['-f', 'adts']
2891 (prefix, ext) = os.path.splitext(path)
2892 new_path = prefix + '.' + extension
2893 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2894 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2897 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
2902 except (IOError, OSError):
2903 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the produced audio file for the next PP.
2906 information['filepath'] = new_path
2909 ### MAIN PROGRAM ###
2910 if __name__ == '__main__':
2912 # Modules needed only when running the main program
2916 # Function to update the program file with the latest version from the repository.
2917 def update_self(downloader, filename):
2918 # Note: downloader only used for options
2919 if not os.access(filename, os.W_OK):
2920 sys.exit('ERROR: no write permissions on %s' % filename)
2922 downloader.to_screen('Updating to latest stable version...')
2924 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2925 latest_version = urllib.urlopen(latest_url).read().strip()
2926 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2927 newcontent = urllib.urlopen(prog_url).read()
2928 except (IOError, OSError), err:
2929 sys.exit('ERROR: unable to download latest version')
2931 stream = open(filename, 'w')
2932 stream.write(newcontent)
2934 except (IOError, OSError), err:
2935 sys.exit('ERROR: unable to overwrite current version')
2936 downloader.to_screen('Updated to version %s' % latest_version)
2938 # Parse command line
2939 parser = optparse.OptionParser(
2940 usage='Usage: %prog [options] url...',
2941 version='2011.07.09-phihag',
2942 conflict_handler='resolve',
2945 parser.add_option('-h', '--help',
2946 action='help', help='print this help text and exit')
2947 parser.add_option('-v', '--version',
2948 action='version', help='print program version and exit')
2949 parser.add_option('-U', '--update',
2950 action='store_true', dest='update_self', help='update this program to latest stable version')
2951 parser.add_option('-i', '--ignore-errors',
2952 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2953 parser.add_option('-r', '--rate-limit',
2954 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2955 parser.add_option('-R', '--retries',
2956 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2957 parser.add_option('--playlist-start',
2958 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2959 parser.add_option('--playlist-end',
2960 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2961 parser.add_option('--dump-user-agent',
2962 action='store_true', dest='dump_user_agent',
2963 help='display the current browser identification', default=False)
2965 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2966 authentication.add_option('-u', '--username',
2967 dest='username', metavar='USERNAME', help='account username')
2968 authentication.add_option('-p', '--password',
2969 dest='password', metavar='PASSWORD', help='account password')
2970 authentication.add_option('-n', '--netrc',
2971 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2972 parser.add_option_group(authentication)
2974 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2975 video_format.add_option('-f', '--format',
2976 action='store', dest='format', metavar='FORMAT', help='video format code')
2977 video_format.add_option('--all-formats',
2978 action='store_const', dest='format', help='download all available video formats', const='-1')
2979 video_format.add_option('--max-quality',
2980 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2981 parser.add_option_group(video_format)
2983 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2984 verbosity.add_option('-q', '--quiet',
2985 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2986 verbosity.add_option('-s', '--simulate',
2987 action='store_true', dest='simulate', help='do not download video', default=False)
2988 verbosity.add_option('-g', '--get-url',
2989 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2990 verbosity.add_option('-e', '--get-title',
2991 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2992 verbosity.add_option('--get-thumbnail',
2993 action='store_true', dest='getthumbnail',
2994 help='simulate, quiet but print thumbnail URL', default=False)
2995 verbosity.add_option('--get-description',
2996 action='store_true', dest='getdescription',
2997 help='simulate, quiet but print video description', default=False)
2998 verbosity.add_option('--get-filename',
2999 action='store_true', dest='getfilename',
3000 help='simulate, quiet but print output filename', default=False)
3001 verbosity.add_option('--no-progress',
3002 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3003 verbosity.add_option('--console-title',
3004 action='store_true', dest='consoletitle',
3005 help='display progress in console titlebar', default=False)
3006 parser.add_option_group(verbosity)
3008 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3009 filesystem.add_option('-t', '--title',
3010 action='store_true', dest='usetitle', help='use title in file name', default=False)
3011 filesystem.add_option('-l', '--literal',
3012 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3013 filesystem.add_option('-A', '--auto-number',
3014 action='store_true', dest='autonumber',
3015 help='number downloaded files starting from 00000', default=False)
3016 filesystem.add_option('-o', '--output',
3017 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3018 filesystem.add_option('-a', '--batch-file',
3019 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3020 filesystem.add_option('-w', '--no-overwrites',
3021 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3022 filesystem.add_option('-c', '--continue',
3023 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3024 filesystem.add_option('--cookies',
3025 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3026 filesystem.add_option('--no-part',
3027 action='store_true', dest='nopart', help='do not use .part files', default=False)
3028 filesystem.add_option('--no-mtime',
3029 action='store_false', dest='updatetime',
3030 help='do not use the Last-modified header to set the file modification time', default=True)
3031 filesystem.add_option('--write-description',
3032 action='store_true', dest='writedescription',
3033 help='write video description to a .description file', default=False)
3034 parser.add_option_group(filesystem)
3036 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3037 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3038 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3039 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3040 help='"best", "aac" or "mp3"; best by default')
3041 parser.add_option_group(postproc)
# Parse the command line, then pick a cookie jar implementation:
# an in-memory CookieJar when no --cookies file was given, otherwise a
# MozillaCookieJar bound to that file (loaded if it already exists and is
# readable, so cookies persist across runs).
3043 (opts, args) = parser.parse_args()
3045 # Open appropriate CookieJar
3046 if opts.cookiefile is None:
3047 jar = cookielib.CookieJar()
# NOTE(review): the listing elides the intervening lines here (numbering jumps
# 3047 -> 3050); presumably an `else:`/`try:` pair precedes this branch and a
# `jar.load()` call sits inside the isfile check — confirm against the full file.
3050 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3051 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
# Python 2 except syntax; any I/O failure while opening the jar is fatal.
3053 except (IOError, OSError), err:
3054 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string the script sends (Python 2 `print`
# statement). NOTE(review): the sys.exit() that presumably follows is elided.
3057 if opts.dump_user_agent:
3058 print std_headers['User-Agent']
3061 # General configuration
# Install a global urllib2 opener that routes every request through the proxy
# handler, the cookie jar chosen above, and the project's YoutubeDLHandler.
3062 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3063 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3064 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3066 # Batch file verification
# Read one URL per line from --batch-file ('-' means stdin; that branch's body
# is elided in this listing — numbering jumps 3070 -> 3073). Blank lines and
# lines starting with '#', '/' or ';' are treated as comments and skipped.
3068 if opts.batchfile is not None:
3070 if opts.batchfile == '-':
3073 batchfd = open(opts.batchfile, 'r')
3074 batchurls = batchfd.readlines()
3075 batchurls = [x.strip() for x in batchurls]
3076 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
# NOTE(review): this sys.exit is the handler of an elided try/except around the
# batch-file read — the `try:`/`except` lines are not visible in this listing.
3078 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then positional command-line arguments.
3079 all_urls = batchurls + args
3081 # Conflicting, missing and erroneous options
# Validate option combinations up front; parser.error() prints the message and
# exits. Mutually exclusive: .netrc vs explicit credentials, -o vs title/number
# based naming, -t vs -l.
3082 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3083 parser.error(u'using .netrc conflicts with giving username/password')
3084 if opts.password is not None and opts.username is None:
3085 parser.error(u'account username missing')
3086 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3087 parser.error(u'using output template conflicts with using title, literal title or auto number')
3088 if opts.usetitle and opts.useliteral:
3089 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively without echoing.
3090 if opts.username is not None and opts.password is None:
3091 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize --rate-limit from a human string (e.g. '50k') to bytes/second.
3092 if opts.ratelimit is not None:
3093 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3094 if numeric_limit is None:
3095 parser.error(u'invalid rate limit specified')
3096 opts.ratelimit = numeric_limit
# Python 2 `long` conversions below; NOTE(review): the `try:` lines pairing
# with these except clauses are elided in this listing (numbering jumps
# 3097 -> 3099, 3101 -> 3103, 3107 -> 3109).
3097 if opts.retries is not None:
3099 opts.retries = long(opts.retries)
3100 except (TypeError, ValueError), err:
3101 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be >= 1; end is -1 (no limit) or >= start.
3103 opts.playliststart = long(opts.playliststart)
3104 if opts.playliststart <= 0:
3106 except (TypeError, ValueError), err:
3107 parser.error(u'invalid playlist start number specified')
3109 opts.playlistend = long(opts.playlistend)
3110 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3112 except (TypeError, ValueError), err:
3113 parser.error(u'invalid playlist end number specified')
# --audio-format is only meaningful (and only validated) with --extract-audio.
3114 if opts.extractaudio:
3115 if opts.audioformat not in ['best', 'aac', 'mp3']:
3116 parser.error(u'invalid audio format specified')
3118 # Information extractors
# Instantiate one extractor (IE) per supported site. Several take another IE
# as a constructor argument: playlist/user/search extractors delegate the
# per-video extraction to the base site extractor they wrap.
3119 youtube_ie = YoutubeIE()
3120 metacafe_ie = MetacafeIE(youtube_ie)
3121 dailymotion_ie = DailymotionIE()
3122 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3123 youtube_user_ie = YoutubeUserIE(youtube_ie)
3124 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3125 google_ie = GoogleIE()
3126 google_search_ie = GoogleSearchIE(google_ie)
3127 photobucket_ie = PhotobucketIE()
3128 yahoo_ie = YahooIE()
3129 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3130 deposit_files_ie = DepositFilesIE()
3131 facebook_ie = FacebookIE()
3132 bliptv_ie = BlipTVIE()
# Catch-all extractor, registered last (see the registration section) so it
# only handles URLs no site-specific extractor claimed.
3133 generic_ie = GenericIE()
# Build the central FileDownloader from the parsed options.
# NOTE(review): the closing `})` of this dict literal is elided in this listing
# (numbering jumps 3171 -> 3173).
3136 fd = FileDownloader({
3137 'usenetrc': opts.usenetrc,
3138 'username': opts.username,
3139 'password': opts.password,
# Any --get-* flag implies quiet mode and simulation: only the requested
# metadata is printed and nothing is downloaded.
3140 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3141 'forceurl': opts.geturl,
3142 'forcetitle': opts.gettitle,
3143 'forcethumbnail': opts.getthumbnail,
3144 'forcedescription': opts.getdescription,
3145 'forcefilename': opts.getfilename,
3146 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3147 'format': opts.format,
3148 'format_limit': opts.format_limit,
# Output template selection via a chained `or`: the first truthy branch wins.
# Priority: explicit -o template (decoded from the locale encoding), then
# --all-formats variants (format '-1' embeds %(format)s in the name), then
# title/literal/autonumber combinations, falling back to plain '%(id)s.%(ext)s'.
3149 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3150 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3151 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3152 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3153 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3154 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3155 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3156 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3157 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3158 or u'%(id)s.%(ext)s'),
3159 'ignoreerrors': opts.ignoreerrors,
3160 'ratelimit': opts.ratelimit,
3161 'nooverwrites': opts.nooverwrites,
3162 'retries': opts.retries,
3163 'continuedl': opts.continue_dl,
3164 'noprogress': opts.noprogress,
3165 'playliststart': opts.playliststart,
3166 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') means progress/log output must go to
# stderr instead.
3167 'logtostderr': opts.outtmpl == '-',
3168 'consoletitle': opts.consoletitle,
3169 'nopart': opts.nopart,
3170 'updatetime': opts.updatetime,
3171 'writedescription': opts.writedescription,
# Register extractors with the downloader. Order matters: the downloader
# presumably tries them in registration order, so the more specific
# search/playlist/user extractors come before the base site extractors,
# and the catch-all generic extractor comes last.
3173 fd.add_info_extractor(youtube_search_ie)
3174 fd.add_info_extractor(youtube_pl_ie)
3175 fd.add_info_extractor(youtube_user_ie)
3176 fd.add_info_extractor(metacafe_ie)
3177 fd.add_info_extractor(dailymotion_ie)
3178 fd.add_info_extractor(youtube_ie)
3179 fd.add_info_extractor(google_ie)
3180 fd.add_info_extractor(google_search_ie)
3181 fd.add_info_extractor(photobucket_ie)
3182 fd.add_info_extractor(yahoo_ie)
3183 fd.add_info_extractor(yahoo_search_ie)
3184 fd.add_info_extractor(deposit_files_ie)
3185 fd.add_info_extractor(facebook_ie)
3186 fd.add_info_extractor(bliptv_ie)
3188 # This must come last since it's the
3189 # fallback if none of the others work
3190 fd.add_info_extractor(generic_ie)
# Optional ffmpeg-based audio extraction runs as a post-processing step.
3193 if opts.extractaudio:
3194 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update: replace this script (sys.argv[0]) with the latest release.
3197 if opts.update_self:
3198 update_self(fd, sys.argv[0])
# With --update, running without URLs is legitimate; otherwise it is an error.
3201 if len(all_urls) < 1:
3202 if not opts.update_self:
3203 parser.error(u'you must provide at least one URL')
# NOTE(review): the `else:`/`sys.exit()` lines around here are elided in this
# listing (numbering jumps 3203 -> 3206).
3206 retcode = fd.download(all_urls)
3208 # Dump cookie jar if requested
# NOTE(review): the `try:`/`jar.save()` lines are elided (3209 -> 3212).
3209 if opts.cookiefile is not None:
3212 except (IOError, OSError), err:
3213 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level exception handling: these except clauses belong to a `try` opened
# before this chunk (and the `sys.exit(retcode)` on the success path is elided).
# DownloadError is already reported by the downloader, so its handler body
# (elided here) needs no extra message.
3217 except DownloadError:
3219 except SameFileError:
3220 sys.exit(u'ERROR: fixed output name but more than one file to download')
3221 except KeyboardInterrupt:
3222 sys.exit(u'\nERROR: Interrupted by user')