2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
12 from __future__ import with_statement
38 import cStringIO as StringIO
42 # parse_qs was moved from the cgi module to the urlparse module recently.
44 from urlparse import parse_qs
46 from cgi import parse_qs
50 except ImportError: # Python < 2.6
# NOTE(review): fragment -- the `std_headers = {` opener and closing brace are
# not visible in this excerpt. These entries imitate a Firefox 4 browser so
# sites serve the same pages they would to a regular user.
54 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
55 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
56 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
57 'Accept-Encoding': 'gzip, deflate',
58 'Accept-Language': 'en-us,en;q=0.5',
# Characters considered safe for a "simplified" video title (ASCII letters and
# digits, decoded to unicode -- this is Python 2 str.decode).
61 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Fallback for Python < 2.6, where the stdlib json module does not exist:
# the bundled trivialjson parser below is used instead.
65 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Bundled trivialjson: a minimal hand-rolled JSON parser used only when the
# stdlib `json` module is unavailable (Python < 2.6). The nested helpers below
# close over `s` (the input string) defined in the enclosing loads() function.
# NOTE(review): many lines are missing from this excerpt (inner numbering
# jumps); names such as `esc`, `bslashes`, `e`, `parseObj`, `parseArray`,
# `parseString` and `parseNumber` are defined on lines not shown here.
# Raise a ValueError that pinpoints the parse failure position within `s`.
71 def raiseError(msg, i):
72 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Advance index `i` past JSON whitespace; raises on premature end of input
# when more tokens are expected.
73 def skipSpace(i, expectMore=True):
74 while i < len(s) and s[i] in ' \t\r\n':
78 raiseError('Premature end', i)
# Decode a single backslash escape matched inside a JSON string literal.
80 def decodeEscape(match):
96 return unichr(int(esc[1:5], 16))
# Surrogate pair: \uD8xx\uDCxx escapes combine into one code point > 0xFFFF.
97 if len(esc) == 5+6 and esc[5:7] == '\\u':
98 hi = int(esc[1:5], 16)
99 low = int(esc[7:11], 16)
100 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
101 raise ValueError('Unknown escape ' + str(esc))
# Count trailing backslashes to decide whether a quote is escaped.
108 while s[e-bslashes-1] == '\\':
110 if bslashes % 2 == 1:
# Regex matches surrogate pairs first, then plain \uXXXX, then any single
# escaped character.
114 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
115 stri = rexp.sub(decodeEscape, s[i:e])
121 if s[i] == '}': # Empty dictionary
125 raiseError('Expected a string object key', i)
126 i,key = parseString(i)
128 if i >= len(s) or s[i] != ':':
129 raiseError('Expected a colon', i)
136 raiseError('Expected comma or closing curly brace', i)
141 if s[i] == ']': # Empty array
146 i = skipSpace(i) # Raise exception if premature end
150 raiseError('Expected a comma or closing bracket', i)
# Parse the literals true/false/null.
152 def parseDiscrete(i):
153 for k,v in {'true': True, 'false': False, 'null': None}.items():
154 if s.startswith(k, i):
156 raiseError('Not a boolean (or null)', i)
# Parse a JSON number; floats are detected by '.' or an exponent marker.
158 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
160 raiseError('Not a number', i)
162 if '.' in nums or 'e' in nums or 'E' in nums:
163 return (i+len(nums), float(nums))
164 return (i+len(nums), int(nums))
# Dispatch table keyed on the first character of a value; anything else is
# assumed to start a number.
165 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
168 i,res = CHARMAP.get(s[i], parseNumber)(i)
169 i = skipSpace(i, False)
# Trailing garbage after the top-level value is an error.
173 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# Determine the encoding used for console output. Implemented via a generator
# so the (possibly failing) locale probe runs lazily; .next() is Python 2.
# NOTE(review): lines are missing from this excerpt (the fallback branch that
# presumably returns an encoding when locale lookup fails is not visible).
176 def preferredencoding():
177 """Get preferred encoding.
179 Returns the best encoding scheme for the system, based on
180 locale.getpreferredencoding() and some further tweaks.
182 def yield_preferredencoding():
184 pref = locale.getpreferredencoding()
190 return yield_preferredencoding().next()
# re.sub() callback: turn one HTML entity match (named, decimal or hex
# numeric) into the corresponding unicode character.
# NOTE(review): lines are missing from this excerpt; the assignment of `base`
# (10 vs 16 depending on the 'x' prefix) is not visible here.
192 def htmlentity_transform(matchobj):
193 """Transforms an HTML entity to a Unicode character.
195 This function receives a match object and is intended to be used with
196 the re.sub() function.
198 entity = matchobj.group(1)
200 # Known non-numeric HTML entity
201 if entity in htmlentitydefs.name2codepoint:
202 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entities: '#160' (decimal) or '#x41' (hex).
205 mobj = re.match(ur'(?u)#(x?\d+)', entity)
207 numstr = mobj.group(1)
208 if numstr.startswith(u'x'):
# Prefix with '0' so 'x41' becomes '0x41', parseable by long(..., 16).
210 numstr = u'0%s' % numstr
213 return unichr(long(numstr, base))
215 # Unknown entity in name, return its literal representation
216 return (u'&%s;' % entity)
218 def sanitize_title(utitle):
219 """Sanitizes a video title so it could be used as part of a filename."""
220 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
221 return utitle.replace(unicode(os.sep), u'%')
# Open `filename` for writing; on failure retry once with Windows-forbidden
# characters replaced. The special name '-' means stdout.
# NOTE(review): lines are missing from this excerpt (the `try:` opener and the
# '-' check around the win32 branch are not visible).
223 def sanitize_open(filename, open_mode):
224 """Try to open the given filename, and slightly tweak it if this fails.
226 Attempts to open the given filename. If this fails, it tries to change
227 the filename slightly, step by step, until it's either able to open it
228 or it fails and raises a final exception, like the standard open()
231 It returns the tuple (stream, definitive_file_name).
# Writing to stdout on Windows requires binary mode, otherwise '\n' would be
# translated and binary video data corrupted.
235 if sys.platform == 'win32':
237 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238 return (sys.stdout, filename)
239 stream = open(filename, open_mode)
240 return (stream, filename)
241 except (IOError, OSError), err:
242 # In case of error, try to remove win32 forbidden chars
243 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
245 # An exception here should be caught in the caller
246 stream = open(filename, open_mode)
247 return (stream, filename)
# Parse an RFC 2822 date header (e.g. Last-modified) into a Unix timestamp.
# NOTE(review): lines are missing from this excerpt -- the initialisation of
# `timestamp` (presumably None) and the return statement are not visible.
249 def timeconvert(timestr):
250 """Convert RFC 2822 defined time string into system timestamp"""
252 timetuple = email.utils.parsedate_tz(timestr)
253 if timetuple is not None:
254 timestamp = email.utils.mktime_tz(timetuple)
# Exception hierarchy used throughout the downloader. All derive directly
# from Exception; ContentTooShortError additionally carries the byte counts
# needed to report a truncated transfer.
# NOTE(review): docstring closers and `pass` bodies are missing from this
# excerpt (inner numbering jumps).
257 class DownloadError(Exception):
258 """Download Error exception.
260 This exception may be thrown by FileDownloader objects if they are not
261 configured to continue on errors. They will contain the appropriate
266 class SameFileError(Exception):
267 """Same File exception.
269 This exception will be thrown by FileDownloader objects if they detect
270 multiple files would have to be downloaded to the same file on disk.
274 class PostProcessingError(Exception):
275 """Post Processing exception.
277 This exception may be raised by PostProcessor's .run() method to
278 indicate an error in the postprocessing task.
282 class UnavailableVideoError(Exception):
283 """Unavailable Format exception.
285 This exception will be thrown when a video is requested
286 in a format that is not available for that video.
290 class ContentTooShortError(Exception):
291 """Content Too Short exception.
293 This exception may be raised by FileDownloader objects when a file they
294 download is too small for what the server announced first, indicating
295 the connection was probably interrupted.
# downloaded: bytes actually received; expected: bytes announced by the
# server's Content-Length header.
301 def __init__(self, downloaded, expected):
302 self.downloaded = downloaded
303 self.expected = expected
# urllib2 handler: injects std_headers into every request and transparently
# decompresses gzip/deflate response bodies.
# NOTE(review): lines are missing from this excerpt -- the `deflate` helper's
# def/try lines, `old_resp = resp` assignments, and the `return` statements of
# http_request/http_response are not visible.
305 class YoutubeDLHandler(urllib2.HTTPHandler):
306 """Handler for HTTP requests and responses.
308 This class, when installed with an OpenerDirector, automatically adds
309 the standard headers to every HTTP request and handles gzipped and
310 deflated responses from web servers. If compression is to be avoided in
311 a particular request, the original request in the program code only has
312 to include the HTTP header "Youtubedl-No-Compression", which will be
313 removed before making the real request.
315 Part of this code was copied from:
317 http://techknack.net/python-urllib2-handlers/
319 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate first (negative wbits = no zlib header), falling back to
# standard zlib-wrapped deflate.
326 return zlib.decompress(data, -zlib.MAX_WBITS)
328 return zlib.decompress(data)
# Build an addinfourl carrying the HTTP status code; older Pythons lack the
# 4-argument constructor, so the code is attached afterwards.
331 def addinfourl_wrapper(stream, headers, url, code):
332 if hasattr(urllib2.addinfourl, 'getcode'):
333 return urllib2.addinfourl(stream, headers, url, code)
334 ret = urllib2.addinfourl(stream, headers, url)
# Add each standard header unless the caller already set it; strip the
# internal Youtubedl-no-compression marker before the request goes out.
338 def http_request(self, req):
339 for h in std_headers:
342 req.add_header(h, std_headers[h])
343 if 'Youtubedl-no-compression' in req.headers:
344 if 'Accept-encoding' in req.headers:
345 del req.headers['Accept-encoding']
346 del req.headers['Youtubedl-no-compression']
# Wrap compressed responses in a decompressing file object, preserving the
# original headers/url/code/msg.
349 def http_response(self, req, resp):
352 if resp.headers.get('Content-encoding', '') == 'gzip':
353 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
354 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
355 resp.msg = old_resp.msg
357 if resp.headers.get('Content-encoding', '') == 'deflate':
358 gz = StringIO.StringIO(self.deflate(resp.read()))
359 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
360 resp.msg = old_resp.msg
363 class FileDownloader(object):
364 """File Downloader class.
366 File downloader objects are the ones responsible of downloading the
367 actual video file and writing it to disk if the user has requested
368 it, among some other tasks. In most cases there should be one per
369 program. As, given a video URL, the downloader doesn't know how to
370 extract all the needed information, task that InfoExtractors do, it
371 has to pass the URL to one of them.
373 For this, file downloader objects have a method that allows
374 InfoExtractors to be registered in a given order. When it is passed
375 a URL, the file downloader handles it to the first InfoExtractor it
376 finds that reports being able to handle it. The InfoExtractor extracts
377 all the information about the video or videos the URL refers to, and
378 asks the FileDownloader to process the video information, possibly
379 downloading the video.
381 File downloaders accept a lot of parameters. In order not to saturate
382 the object constructor with arguments, it receives a dictionary of
383 options instead. These options are available through the params
384 attribute for the InfoExtractors to use. The FileDownloader also
385 registers itself as the downloader in charge for the InfoExtractors
386 that are added to it, so this is a "mutual registration".
390 username: Username for authentication purposes.
391 password: Password for authentication purposes.
392 usenetrc: Use netrc for authentication instead.
393 quiet: Do not print messages to stdout.
394 forceurl: Force printing final URL.
395 forcetitle: Force printing title.
396 forcethumbnail: Force printing thumbnail URL.
397 forcedescription: Force printing description.
398 forcefilename: Force printing final filename.
399 simulate: Do not download the video files.
400 format: Video format code.
401 format_limit: Highest quality format to try.
402 outtmpl: Template for output names.
403 ignoreerrors: Do not stop on download errors.
404 ratelimit: Download speed limit, in bytes/sec.
405 nooverwrites: Prevent overwriting files.
406 retries: Number of times to retry for HTTP error 5xx
407 continuedl: Try to continue downloads if possible.
408 noprogress: Do not print the progress bar.
409 playliststart: Playlist item to start at.
410 playlistend: Playlist item to end at.
411 logtostderr: Log messages to stderr instead of stdout.
412 consoletitle: Display progress in console window's titlebar.
413 nopart: Do not use temporary .part files.
414 updatetime: Use the Last-modified header to set output file timestamps.
415 writedescription: Write the video description to a .description file
416 writeinfojson: Write the video description to a .info.json file
# Class-level defaults; each instance re-initialises these in __init__.
# _download_retcode: process exit code accumulated over all downloads.
# _num_downloads: ordinal used by the %(autonumber)s output template field.
422 _download_retcode = None
423 _num_downloads = None
# NOTE(review): lines are missing from this excerpt -- the assignments of
# self._ies / self._pps / self.params in __init__ are not visible.
426 def __init__(self, params):
427 """Create a FileDownloader object with the given options."""
430 self._download_retcode = 0
431 self._num_downloads = 0
# Pick the message stream once: stderr when logtostderr is set, else stdout.
432 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# Static formatting / filesystem helpers. NOTE(review): lines are missing
# throughout this excerpt (e.g. the os.mkdir call in pmkdir, the guard
# branches of format_bytes/calc_percent/calc_eta, the `dif` assignment in
# calc_speed, and the None-match check in parse_bytes are not visible).
436 def pmkdir(filename):
437 """Create directory components in filename. Similar to Unix "mkdir -p"."""
438 components = filename.split(os.sep)
# Build the list of successively longer prefixes, each ending in os.sep,
# so parents are created before children.
439 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
440 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
441 for dir in aggregate:
442 if not os.path.exists(dir):
# Human-readable byte count, e.g. '1.21M'.
446 def format_bytes(bytes):
449 if type(bytes) is str:
454 exponent = long(math.log(bytes, 1024.0))
455 suffix = 'bkMGTPEZY'[exponent]
456 converted = float(bytes) / float(1024**exponent)
457 return '%.2f%s' % (converted, suffix)
# Progress percentage padded to a fixed width for the progress line.
460 def calc_percent(byte_counter, data_len):
463 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimated time remaining as MM:SS, based on average rate so far.
466 def calc_eta(start, now, total, current):
470 if current == 0 or dif < 0.001: # One millisecond
472 rate = float(current) / dif
473 eta = long((float(total) - float(current)) / rate)
474 (eta_mins, eta_secs) = divmod(eta, 60)
477 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed formatted for the progress line.
480 def calc_speed(start, now, bytes):
482 if bytes == 0 or dif < 0.001: # One millisecond
483 return '%10s' % '---b/s'
484 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adaptive read size: halve/double around the last block's throughput,
# clamped to at most 4 MB.
487 def best_block_size(elapsed_time, bytes):
488 new_min = max(bytes / 2.0, 1.0)
489 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
490 if elapsed_time < 0.001:
492 rate = bytes / elapsed_time
# Parse strings such as '50k' or '44.6m' (used by --rate-limit).
500 def parse_bytes(bytestr):
501 """Parse a string indicating a byte quantity into a long integer."""
502 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
505 number = float(matchobj.group(1))
506 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
507 return long(round(number * multiplier))
# Mutual registration: the downloader keeps the IE/PP and the IE/PP learns
# who its downloader is. NOTE(review): the list-append lines and the `try:` /
# `raise` lines of to_screen are missing from this excerpt.
509 def add_info_extractor(self, ie):
510 """Add an InfoExtractor object to the end of the list."""
512 ie.set_downloader(self)
514 def add_post_processor(self, pp):
515 """Add a PostProcessor object to the end of the chain."""
517 pp.set_downloader(self)
# Central message sink; honours --quiet and optionally swallows
# UnicodeEncodeError so progress output never kills a download.
519 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
520 """Print message to stdout if not in quiet mode."""
522 if not self.params.get('quiet', False):
# Trailing comma on the print statement suppresses its own newline; the
# terminator chosen here supplies it unless skip_eol is requested.
523 terminator = [u'\n', u''][skip_eol]
524 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
525 self._screen_file.flush()
526 except (UnicodeEncodeError), err:
527 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write *message*, encoded for the console, to standard error."""
	encoded = message.encode(preferredencoding())
	sys.stderr.write(encoded)
	sys.stderr.write('\n')
# Progress-in-titlebar support. NOTE(review): the early `return` after the
# consoletitle check is missing from this excerpt.
534 def to_cons_title(self, message):
535 """Set console/terminal window title to message."""
536 if not self.params.get('consoletitle', False):
# On Windows use the Win32 API (only when a real console window exists);
# elsewhere fall back to the xterm OSC 0 escape sequence.
538 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
539 # c_wchar_p() might not be necessary if `message` is
540 # already of type unicode()
541 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
542 elif 'TERM' in os.environ:
543 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
# A template with no %(...)s fields always produces the same filename, so
# downloading multiple URLs with it would overwrite the same file.
545 def fixed_template(self):
546 """Checks if the output template is fixed."""
547 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
549 def trouble(self, message=None):
550 """Determine action to take when a download problem appears.
552 Depending on if the downloader has been configured to ignore
553 download errors or not, this method may throw an exception or
554 not when errors are found, after printing the message.
556 if message is not None:
557 self.to_stderr(message)
# With --ignore-errors the error only flips the final exit code to 1;
# otherwise it aborts the whole run via DownloadError.
558 if not self.params.get('ignoreerrors', False):
559 raise DownloadError(message)
560 self._download_retcode = 1
# Throttle to --rate-limit by sleeping off the time we are "ahead of
# schedule". NOTE(review): the `now = time.time()` and early-return lines are
# missing from this excerpt.
562 def slow_down(self, start_time, byte_counter):
563 """Sleep if the download speed is over the rate limit."""
564 rate_limit = self.params.get('ratelimit', None)
565 if rate_limit is None or byte_counter == 0:
568 elapsed = now - start_time
571 speed = float(byte_counter) / elapsed
572 if speed > rate_limit:
# Sleep exactly long enough that byte_counter / total_time == rate_limit.
573 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# .part-file management. NOTE(review): several early-return lines are missing
# from this excerpt (e.g. returning the name unchanged when --no-part is set,
# the final `return filename` of undo_temp_name, and the try/except around
# os.rename / os.utime).
575 def temp_name(self, filename):
576 """Returns a temporary filename for the given filename."""
# Stdout ('-') and non-regular files cannot take a .part suffix.
577 if self.params.get('nopart', False) or filename == u'-' or \
578 (os.path.exists(filename) and not os.path.isfile(filename)):
580 return filename + u'.part'
# Inverse of temp_name: strip a trailing .part suffix if present.
582 def undo_temp_name(self, filename):
583 if filename.endswith(u'.part'):
584 return filename[:-len(u'.part')]
# Rename the completed .part file to its final name; failure is reported
# through trouble() rather than raised directly.
587 def try_rename(self, old_filename, new_filename):
589 if old_filename == new_filename:
591 os.rename(old_filename, new_filename)
592 except (IOError, OSError), err:
593 self.trouble(u'ERROR: unable to rename file')
# Best-effort: stamp the file's mtime from the server's Last-modified header
# (used by --updatetime behaviour).
595 def try_utime(self, filename, last_modified_hdr):
596 """Try to set the last-modified time of the given file."""
597 if last_modified_hdr is None:
599 if not os.path.isfile(filename):
601 timestr = last_modified_hdr
604 filetime = timeconvert(timestr)
# atime is set to "now", mtime to the server-provided time.
608 os.utime(filename,(time.time(), filetime))
# Thin user-feedback wrappers around to_screen(); kept as separate methods so
# messages are uniform and callers stay terse.
612 def report_writedescription(self, descfn):
613 """ Report that the description file is being written """
614 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
616 def report_writeinfojson(self, infofn):
617 """ Report that the metadata file has been written """
618 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
620 def report_destination(self, filename):
621 """Report destination filename."""
622 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
# Progress line is re-drawn in place via the leading '\r' and skip_eol.
# NOTE(review): the early `return` for --no-progress is missing from this
# excerpt.
624 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
625 """Report download progress."""
626 if self.params.get('noprogress', False):
628 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
629 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
630 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
631 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
633 def report_resuming_byte(self, resume_len):
634 """Report attempt to resume at given byte."""
635 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
637 def report_retry(self, count, retries):
638 """Report retry in case of HTTP error 5xx"""
639 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
# The filename may not be encodable for the console; fall back to a message
# without it. NOTE(review): the `try:` line is missing from this excerpt.
641 def report_file_already_downloaded(self, file_name):
642 """Report file has already been fully downloaded."""
644 self.to_screen(u'[download] %s has already been downloaded' % file_name)
645 except (UnicodeEncodeError), err:
646 self.to_screen(u'[download] The file has already been downloaded')
648 def report_unable_to_resume(self):
649 """Report it was impossible to resume download."""
650 self.to_screen(u'[download] Unable to resume')
# NOTE(review): the else-branch that finishes the progress line when progress
# was shown is missing from this excerpt.
652 def report_finish(self):
653 """Report download finished."""
654 if self.params.get('noprogress', False):
655 self.to_screen(u'[download] Download completed')
# Ordinal backing the %(autonumber)s template field; bumped once per file.
659 def increment_downloads(self):
660 """Increment the ordinal that assigns a number to each file."""
661 self._num_downloads += 1
# Expand the --output template against the video's info dict, adding the
# synthetic %(epoch)s and %(autonumber)s fields. NOTE(review): the `try:` and
# the `return filename` / `return None` lines are missing from this excerpt.
663 def prepare_filename(self, info_dict):
664 """Generate the output filename."""
# Work on a copy so the caller's info_dict is not polluted with the
# synthetic keys.
666 template_dict = dict(info_dict)
667 template_dict['epoch'] = unicode(long(time.time()))
668 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
669 filename = self.params['outtmpl'] % template_dict
# KeyError: template references a field the IE did not supply;
# ValueError: malformed template or un-encodable expansion.
671 except (ValueError, KeyError), err:
672 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# Orchestrates one extracted video: simulate/force-print switches, overwrite
# guard, directory creation, optional .description / .info.json sidecars,
# the actual download, and post-processing. NOTE(review): many lines are
# missing from this excerpt (several `return` / `try:` lines and the json
# import probe before the .info.json write).
675 def process_info(self, info_dict):
676 """Process a single dictionary returned by an InfoExtractor."""
677 filename = self.prepare_filename(info_dict)
678 # Do nothing else if in simulate mode
679 if self.params.get('simulate', False):
# --get-title / --get-url / ... : print the requested field(s) and stop.
# xmlcharrefreplace keeps un-encodable characters visible as &#...; refs.
681 if self.params.get('forcetitle', False):
682 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
683 if self.params.get('forceurl', False):
684 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
685 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
686 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
687 if self.params.get('forcedescription', False) and 'description' in info_dict:
688 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
689 if self.params.get('forcefilename', False) and filename is not None:
690 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('nooverwrites', False) and os.path.exists(filename):
697 self.to_stderr(u'WARNING: file exists and will be skipped')
701 self.pmkdir(filename)
702 except (OSError, IOError), err:
703 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# Optional sidecar: plain-text description next to the video file.
706 if self.params.get('writedescription', False):
708 descfn = filename + '.description'
709 self.report_writedescription(descfn)
710 with contextlib.closing(open(descfn, 'wb')) as descfile:
711 descfile.write(info_dict['description'].encode('utf-8'))
712 except (OSError, IOError):
713 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
# Optional sidecar: full info dict serialised as JSON.
716 if self.params.get('writeinfojson', False):
717 infofn = filename + '.info.json'
718 self.report_writeinfojson(infofn)
# `json` may be the trivialjson fallback, which has no dump(); hence the
# NameError/AttributeError guard.
721 except (NameError,AttributeError):
722 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
725 with contextlib.closing(open(infofn, 'wb')) as infof:
726 json.dump(info_dict, infof)
727 except (OSError, IOError):
728 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
# The actual transfer; local I/O errors become UnavailableVideoError while
# network errors are reported and abort this file only.
732 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
733 except (OSError, IOError), err:
734 raise UnavailableVideoError
735 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
736 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
738 except (ContentTooShortError, ), err:
739 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
744 self.post_process(filename, info_dict)
745 except (PostProcessingError), err:
746 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# Entry point: route each URL to the first InfoExtractor that claims it.
# NOTE(review): the loops over url_list / self._ies, the ie.extract(url)
# call, and post_process's loop over self._pps are missing from this excerpt.
749 def download(self, url_list):
750 """Download a given list of URLs."""
# A template without variable fields would map every URL to one file.
751 if len(url_list) > 1 and self.fixed_template():
752 raise SameFileError(self.params['outtmpl'])
755 suitable_found = False
757 # Go to next InfoExtractor if not suitable
758 if not ie.suitable(url):
761 # Suitable InfoExtractor found
762 suitable_found = True
764 # Extract information from URL and process it
767 # Suitable InfoExtractor had been found; go to next URL
770 if not suitable_found:
771 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
773 return self._download_retcode
# Run every registered PostProcessor over the finished file; `info` is a
# copy of ie_info (copy made on a line not visible here).
775 def post_process(self, filename, ie_info):
776 """Run the postprocessing chain on the given file."""
778 info['filepath'] = filename
# Delegate rtmp:// downloads to the external rtmpdump binary, resuming with
# `-e` until the file stops growing. NOTE(review): several lines are missing
# from this excerpt (the availability-probe `try:`, the `break`/`return`
# statements, and the success branch after the loop).
784 def _download_with_rtmpdump(self, filename, url, player_url):
785 self.report_destination(filename)
786 tmpfilename = self.temp_name(filename)
788 # Check for rtmpdump first
# Probe by running `rtmpdump -h` with all output discarded; an OSError
# means the binary is not installed / not on PATH.
790 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
791 except (OSError, IOError):
792 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
795 # Download using rtmpdump. rtmpdump returns exit code 2 when
796 # the connection was interrumpted and resuming appears to be
797 # possible. This is part of rtmpdump's normal usage, AFAIK.
798 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
799 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
800 while retval == 2 or retval == 1:
801 prevsize = os.path.getsize(tmpfilename)
802 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
803 time.sleep(5.0) # This seems to be needed
804 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
805 cursize = os.path.getsize(tmpfilename)
# No progress between retries with exit code 1 -> give up resuming.
806 if prevsize == cursize and retval == 1:
809 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
810 self.try_rename(tmpfilename, filename)
813 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Core HTTP download loop: resume support via Range, retry on 5xx, adaptive
# block size, progress reporting, rate limiting, and .part-file handling.
# NOTE(review): many lines are missing from this excerpt (returns, `try:`
# lines, count/open_mode/start initialisation, the retry back-off, and the
# stream close / report_finish at the end).
816 def _do_download(self, filename, url, player_url):
817 # Check file already present
818 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
819 self.report_file_already_downloaded(filename)
822 # Attempt to download using rtmpdump
823 if url.startswith('rtmp'):
824 return self._download_with_rtmpdump(filename, url, player_url)
826 tmpfilename = self.temp_name(filename)
830 # Do not include the Accept-Encoding header
# basic_request is kept un-ranged so it can re-probe the full length when a
# Range request comes back 416.
831 headers = {'Youtubedl-no-compression': 'True'}
832 basic_request = urllib2.Request(url, None, headers)
833 request = urllib2.Request(url, None, headers)
835 # Establish possible resume length
836 if os.path.isfile(tmpfilename):
837 resume_len = os.path.getsize(tmpfilename)
841 # Request parameters in case of being able to resume
842 if self.params.get('continuedl', False) and resume_len != 0:
843 self.report_resuming_byte(resume_len)
844 request.add_header('Range','bytes=%d-' % resume_len)
848 retries = self.params.get('retries', 0)
849 while count <= retries:
850 # Establish connection
852 data = urllib2.urlopen(request)
854 except (urllib2.HTTPError, ), err:
# Only 5xx errors are retried; anything else (except 416) is fatal.
855 if (err.code < 500 or err.code >= 600) and err.code != 416:
856 # Unexpected HTTP error
858 elif err.code == 416:
859 # Unable to resume (requested range not satisfiable)
861 # Open the connection again without the range header
862 data = urllib2.urlopen(basic_request)
863 content_length = data.info()['Content-Length']
864 except (urllib2.HTTPError, ), err:
865 if err.code < 500 or err.code >= 600:
868 # Examine the reported length
869 if (content_length is not None and
870 (resume_len - 100 < long(content_length) < resume_len + 100)):
871 # The file had already been fully downloaded.
872 # Explanation to the above condition: in issue #175 it was revealed that
873 # YouTube sometimes adds or removes a few bytes from the end of the file,
874 # changing the file size slightly and causing problems for some users. So
875 # I decided to implement a suggested change and consider the file
876 # completely downloaded if the file size differs less than 100 bytes from
877 # the one in the hard drive.
878 self.report_file_already_downloaded(filename)
879 self.try_rename(tmpfilename, filename)
882 # The length does not match, we start the download over
883 self.report_unable_to_resume()
889 self.report_retry(count, retries)
892 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Content-Length covers only the remaining range; add resume_len back to
# get the final total for progress arithmetic.
895 data_len = data.info().get('Content-length', None)
896 if data_len is not None:
897 data_len = long(data_len) + resume_len
898 data_len_str = self.format_bytes(data_len)
899 byte_counter = 0 + resume_len
# Main read/write loop (loop header not visible in this excerpt).
905 data_block = data.read(block_size)
907 if len(data_block) == 0:
909 byte_counter += len(data_block)
911 # Open file just in time
914 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
# sanitize_open may have altered the temp name; derive the final name.
915 filename = self.undo_temp_name(tmpfilename)
916 self.report_destination(filename)
917 except (OSError, IOError), err:
918 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
921 stream.write(data_block)
922 except (IOError, OSError), err:
923 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Tune the next read to the last block's observed throughput.
925 block_size = self.best_block_size(after - before, len(data_block))
928 percent_str = self.calc_percent(byte_counter, data_len)
# Rate figures exclude the resumed prefix -- only this session's bytes.
929 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
930 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
931 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
934 self.slow_down(start, byte_counter - resume_len)
# Server sent fewer bytes than announced -> treat as a failed transfer.
938 if data_len is not None and byte_counter != data_len:
939 raise ContentTooShortError(byte_counter, long(data_len))
940 self.try_rename(tmpfilename, filename)
942 # Update file modification time
943 if self.params.get('updatetime', True):
944 self.try_utime(filename, data.info().get('last-modified', None))
948 class InfoExtractor(object):
949 """Information Extractor class.
951 Information extractors are the classes that, given a URL, extract
952 information from the video (or videos) the URL refers to. This
953 information includes the real video URL, the video title and simplified
954 title, author and others. The information is stored in a dictionary
955 which is then passed to the FileDownloader. The FileDownloader
956 processes this information possibly downloading the video to the file
957 system, among other possible outcomes. The dictionaries must include
958 the following fields:
960 id: Video identifier.
961 url: Final video URL.
962 uploader: Nickname of the video uploader.
963 title: Literal title.
964 stitle: Simplified title.
965 ext: Video filename extension.
966 format: Video format.
967 player_url: SWF Player URL (may be None).
969 The following fields are optional. Their primary purpose is to allow
970 youtube-dl to serve as the backend for a video search function, such
971 as the one in youtube2mp3. They are only used when their respective
972 forced printing functions are called:
974 thumbnail: Full URL to a video thumbnail image.
975 description: One-line video description.
977 Subclasses of this one should re-define the _real_initialize() and
978 _real_extract() methods, as well as the suitable() static method.
979 Probably, they should also be instantiated and added to the main
# NOTE(review): lines are missing in the methods below (e.g. the
# `self._ready = False` init, the `if not self._ready:` lazy-init guard, the
# `self.initialize()` call in extract, and the `pass` bodies of the two
# _real_* template methods).
986 def __init__(self, downloader=None):
987 """Constructor. Receives an optional downloader."""
989 self.set_downloader(downloader)
# Static/abstract predicate: subclasses decide whether they handle `url`.
993 """Receives a URL and returns True if suitable for this IE."""
# Public wrapper that performs one-time setup before extraction.
996 def initialize(self):
997 """Initializes an instance (authentication, etc)."""
999 self._real_initialize()
1002 def extract(self, url):
1003 """Extracts URL information and returns it in list of dicts."""
1005 return self._real_extract(url)
# Second half of the mutual registration started in
# FileDownloader.add_info_extractor().
1007 def set_downloader(self, downloader):
1008 """Sets the downloader for this IE."""
1009 self._downloader = downloader
# Template methods for subclasses; the base class does nothing.
1011 def _real_initialize(self):
1012 """Real initialization process. Redefine in subclasses."""
1015 def _real_extract(self, url):
1016 """Real extraction process. Redefine in subclasses."""
1019 class YoutubeIE(InfoExtractor):
1020 """Information extractor for youtube.com."""
# Accepts watch pages, youtu.be short links, /v/, /embed/, /e/ and
# youtube-nocookie.com variants; group 2 captures the video id.
1022 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
# Forces English pages so the regex-based scraping below stays stable.
1023 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1024 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1025 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1026 _NETRC_MACHINE = 'youtube'
1027 # Listed in order of quality
1028 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
# Maps itag format codes to file extensions. NOTE(review): most entries of
# this dict are missing from this excerpt.
1029 _video_extensions = {
1035 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# suitable(): static URL predicate (its `def`/decorator lines are not
# visible in this excerpt).
1042 return (re.match(YoutubeIE._VALID_URL, url) is not None)
# User-feedback helpers mirroring FileDownloader's report_* style.
1044 def report_lang(self):
1045 """Report attempt to set language."""
1046 self._downloader.to_screen(u'[youtube] Setting language')
1048 def report_login(self):
1049 """Report attempt to log in."""
1050 self._downloader.to_screen(u'[youtube] Logging in')
1052 def report_age_confirmation(self):
1053 """Report attempt to confirm age."""
1054 self._downloader.to_screen(u'[youtube] Confirming age')
1056 def report_video_webpage_download(self, video_id):
1057 """Report attempt to download video webpage."""
1058 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1060 def report_video_info_webpage_download(self, video_id):
1061 """Report attempt to download video info webpage."""
1062 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1064 def report_information_extraction(self, video_id):
1065 """Report attempt to extract video information."""
1066 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1068 def report_unavailable_format(self, video_id, format):
1069 """Report extracted video URL."""
1070 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1072 def report_rtmp_download(self):
1073 """Indicate the download will use the RTMP protocol."""
1074 self._downloader.to_screen(u'[youtube] RTMP download detected')
# One-time session setup: choose credentials (CLI flags, else .netrc), set
# the site language to English, log in if credentials were given, and
# confirm age. NOTE(review): multiple lines are missing from this excerpt
# (username/password defaults, several `return`/`try:` lines, report_lang()
# and report_login() calls, and most of the login/age form dictionaries).
1076 def _real_initialize(self):
1077 if self._downloader is None:
1082 downloader_params = self._downloader.params
1084 # Attempt to use provided username and password or .netrc data
1085 if downloader_params.get('username', None) is not None:
1086 username = downloader_params['username']
1087 password = downloader_params['password']
1088 elif downloader_params.get('usenetrc', False):
# .netrc lookup keyed on the 'youtube' machine entry.
1090 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1091 if info is not None:
1095 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1096 except (IOError, netrc.NetrcParseError), err:
1097 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language to English -- failure is only a warning, scraping may still
# work.
1101 request = urllib2.Request(self._LANG_URL)
1104 urllib2.urlopen(request).read()
1105 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1106 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1109 # No authentication to be performed
1110 if username is None:
# Login form fields (dict opener/closer lines not visible here).
1115 'current_form': 'loginForm',
1117 'action_login': 'Log In',
1118 'username': username,
1119 'password': password,
1121 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1124 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1125 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1126 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1129 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation -- unlike login, failure here is a hard error.
1135 'action_confirm': 'Confirm',
1137 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1139 self.report_age_confirmation()
1140 age_results = urllib2.urlopen(request).read()
1141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1142 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1145 def _real_extract(self, url):
1146 # Extract video id from URL
1147 mobj = re.match(self._VALID_URL, url)
1149 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1151 video_id = mobj.group(2)
1154 self.report_video_webpage_download(video_id)
1155 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1157 video_webpage = urllib2.urlopen(request).read()
1158 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1162 # Attempt to extract SWF player URL
1163 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1164 if mobj is not None:
1165 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1170 self.report_video_info_webpage_download(video_id)
1171 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1172 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1173 % (video_id, el_type))
1174 request = urllib2.Request(video_info_url)
1176 video_info_webpage = urllib2.urlopen(request).read()
1177 video_info = parse_qs(video_info_webpage)
1178 if 'token' in video_info:
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1183 if 'token' not in video_info:
1184 if 'reason' in video_info:
1185 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1187 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1190 # Start extracting information
1191 self.report_information_extraction(video_id)
1194 if 'author' not in video_info:
1195 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1197 video_uploader = urllib.unquote_plus(video_info['author'][0])
1200 if 'title' not in video_info:
1201 self._downloader.trouble(u'ERROR: unable to extract video title')
1203 video_title = urllib.unquote_plus(video_info['title'][0])
1204 video_title = video_title.decode('utf-8')
1205 video_title = sanitize_title(video_title)
1208 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1209 simple_title = simple_title.strip(ur'_')
1212 if 'thumbnail_url' not in video_info:
1213 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1214 video_thumbnail = ''
1215 else: # don't panic if we can't find it
1216 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1220 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1221 if mobj is not None:
1222 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1223 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1224 for expression in format_expressions:
1226 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1234 video_description = u'No description available.'
1235 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1236 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1237 if mobj is not None:
1238 video_description = mobj.group(1).decode('utf-8')
1240 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1241 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1242 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1243 # TODO use another parser
1246 video_token = urllib.unquote_plus(video_info['token'][0])
1248 # Decide which formats to download
1249 req_format = self._downloader.params.get('format', None)
1251 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1252 self.report_rtmp_download()
1253 video_url_list = [(None, video_info['conn'][0])]
1254 print(repr(video_info['conn'][0]))
1255 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1256 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1257 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1258 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1259 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1261 format_limit = self._downloader.params.get('format_limit', None)
1262 if format_limit is not None and format_limit in self._available_formats:
1263 format_list = self._available_formats[self._available_formats.index(format_limit):]
1265 format_list = self._available_formats
1266 existing_formats = [x for x in format_list if x in url_map]
1267 if len(existing_formats) == 0:
1268 self._downloader.trouble(u'ERROR: no known formats available for video')
1270 if req_format is None:
1271 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1272 elif req_format == '-1':
1273 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1276 if req_format not in url_map:
1277 self._downloader.trouble(u'ERROR: requested format not available')
1279 video_url_list = [(req_format, url_map[req_format])] # Specific format
1281 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1284 for format_param, video_real_url in video_url_list:
1285 # At this point we have a new video
1286 self._downloader.increment_downloads()
1289 video_extension = self._video_extensions.get(format_param, 'flv')
1291 # Find the video URL in fmt_url_map or conn paramters
1293 # Process video information
1294 self._downloader.process_info({
1295 'id': video_id.decode('utf-8'),
1296 'url': video_real_url.decode('utf-8'),
1297 'uploader': video_uploader.decode('utf-8'),
1298 'upload_date': upload_date,
1299 'title': video_title,
1300 'stitle': simple_title,
1301 'ext': video_extension.decode('utf-8'),
1302 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1303 'thumbnail': video_thumbnail.decode('utf-8'),
1304 'description': video_description,
1305 'player_url': player_url,
1307 except UnavailableVideoError, err:
1308 self._downloader.trouble(u'\nERROR: unable to download video')
1311 class MetacafeIE(InfoExtractor):
1312 """Information Extractor for metacafe.com."""
1314 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1315 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1316 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1319 def __init__(self, youtube_ie, downloader=None):
1320 InfoExtractor.__init__(self, downloader)
1321 self._youtube_ie = youtube_ie
1325 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1327 def report_disclaimer(self):
1328 """Report disclaimer retrieval."""
1329 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1331 def report_age_confirmation(self):
1332 """Report attempt to confirm age."""
1333 self._downloader.to_screen(u'[metacafe] Confirming age')
1335 def report_download_webpage(self, video_id):
1336 """Report webpage download."""
1337 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1339 def report_extraction(self, video_id):
1340 """Report information extraction."""
1341 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1343 def _real_initialize(self):
1344 # Retrieve disclaimer
1345 request = urllib2.Request(self._DISCLAIMER)
1347 self.report_disclaimer()
1348 disclaimer = urllib2.urlopen(request).read()
1349 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1350 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1356 'submit': "Continue - I'm over 18",
1358 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1360 self.report_age_confirmation()
1361 disclaimer = urllib2.urlopen(request).read()
1362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1366 def _real_extract(self, url):
1367 # Extract id and simplified title from URL
1368 mobj = re.match(self._VALID_URL, url)
1370 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1373 video_id = mobj.group(1)
1375 # Check if video comes from YouTube
1376 mobj2 = re.match(r'^yt-(.*)$', video_id)
1377 if mobj2 is not None:
1378 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1381 # At this point we have a new video
1382 self._downloader.increment_downloads()
1384 simple_title = mobj.group(2).decode('utf-8')
1386 # Retrieve video webpage to extract further information
1387 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1389 self.report_download_webpage(video_id)
1390 webpage = urllib2.urlopen(request).read()
1391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1395 # Extract URL, uploader and title from webpage
1396 self.report_extraction(video_id)
1397 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1398 if mobj is not None:
1399 mediaURL = urllib.unquote(mobj.group(1))
1400 video_extension = mediaURL[-3:]
1402 # Extract gdaKey if available
1403 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1405 video_url = mediaURL
1407 gdaKey = mobj.group(1)
1408 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1410 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1412 self._downloader.trouble(u'ERROR: unable to extract media URL')
1414 vardict = parse_qs(mobj.group(1))
1415 if 'mediaData' not in vardict:
1416 self._downloader.trouble(u'ERROR: unable to extract media URL')
1418 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1420 self._downloader.trouble(u'ERROR: unable to extract media URL')
1422 mediaURL = mobj.group(1).replace('\\/', '/')
1423 video_extension = mediaURL[-3:]
1424 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1426 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1428 self._downloader.trouble(u'ERROR: unable to extract title')
1430 video_title = mobj.group(1).decode('utf-8')
1431 video_title = sanitize_title(video_title)
1433 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1435 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1437 video_uploader = mobj.group(1)
1440 # Process video information
1441 self._downloader.process_info({
1442 'id': video_id.decode('utf-8'),
1443 'url': video_url.decode('utf-8'),
1444 'uploader': video_uploader.decode('utf-8'),
1445 'upload_date': u'NA',
1446 'title': video_title,
1447 'stitle': simple_title,
1448 'ext': video_extension.decode('utf-8'),
1452 except UnavailableVideoError:
1453 self._downloader.trouble(u'\nERROR: unable to download video')
1456 class DailymotionIE(InfoExtractor):
1457 """Information Extractor for Dailymotion"""
1459 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1461 def __init__(self, downloader=None):
1462 InfoExtractor.__init__(self, downloader)
1466 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1468 def report_download_webpage(self, video_id):
1469 """Report webpage download."""
1470 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1472 def report_extraction(self, video_id):
1473 """Report information extraction."""
1474 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1476 def _real_initialize(self):
1479 def _real_extract(self, url):
1480 # Extract id and simplified title from URL
1481 mobj = re.match(self._VALID_URL, url)
1483 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1486 # At this point we have a new video
1487 self._downloader.increment_downloads()
1488 video_id = mobj.group(1)
1490 simple_title = mobj.group(2).decode('utf-8')
1491 video_extension = 'flv'
1493 # Retrieve video webpage to extract further information
1494 request = urllib2.Request(url)
1496 self.report_download_webpage(video_id)
1497 webpage = urllib2.urlopen(request).read()
1498 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1499 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1502 # Extract URL, uploader and title from webpage
1503 self.report_extraction(video_id)
1504 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1506 self._downloader.trouble(u'ERROR: unable to extract media URL')
1508 mediaURL = urllib.unquote(mobj.group(1))
1510 # if needed add http://www.dailymotion.com/ if relative URL
1512 video_url = mediaURL
1514 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1515 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1517 self._downloader.trouble(u'ERROR: unable to extract title')
1519 video_title = mobj.group(1).decode('utf-8')
1520 video_title = sanitize_title(video_title)
1522 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1524 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1526 video_uploader = mobj.group(1)
1529 # Process video information
1530 self._downloader.process_info({
1531 'id': video_id.decode('utf-8'),
1532 'url': video_url.decode('utf-8'),
1533 'uploader': video_uploader.decode('utf-8'),
1534 'upload_date': u'NA',
1535 'title': video_title,
1536 'stitle': simple_title,
1537 'ext': video_extension.decode('utf-8'),
1541 except UnavailableVideoError:
1542 self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    NOTE(review): several lines are missing from this copy of the file
    (the `def suitable(url)` header owning the orphan `return` below,
    `try:` statements matched by the `except` clauses, `if mobj is
    None:` guards and `return`s) — restore them before shipping.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Extract metadata and media URL for a Google Video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # mp4 download_url first; fall back to the escaped flv videoUrl
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Un-escape the JS hex escapes ('=' and '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when the user asked for it; it
        # costs an extra search-page request)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): several lines are missing from this copy of the file
    (the `def suitable(url)` header owning the orphan `return` below,
    `try:` statements matched by the `except` clauses, `if mobj is
    None:` guards and `return`s) — restore them before shipping.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Extract metadata and media URL for a Photobucket flv URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title regex also captures the uploader in group(2)
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1737 class YahooIE(InfoExtractor):
1738 """Information extractor for video.yahoo.com."""
1740 # _VALID_URL matches all Yahoo! Video URLs
1741 # _VPAGE_URL matches only the extractable '/watch/' URLs
1742 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1743 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1745 def __init__(self, downloader=None):
1746 InfoExtractor.__init__(self, downloader)
1750 return (re.match(YahooIE._VALID_URL, url) is not None)
1752 def report_download_webpage(self, video_id):
1753 """Report webpage download."""
1754 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1756 def report_extraction(self, video_id):
1757 """Report information extraction."""
1758 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1760 def _real_initialize(self):
1763 def _real_extract(self, url, new_video=True):
1764 # Extract ID from URL
1765 mobj = re.match(self._VALID_URL, url)
1767 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1770 # At this point we have a new video
1771 self._downloader.increment_downloads()
1772 video_id = mobj.group(2)
1773 video_extension = 'flv'
1775 # Rewrite valid but non-extractable URLs as
1776 # extractable English language /watch/ URLs
1777 if re.match(self._VPAGE_URL, url) is None:
1778 request = urllib2.Request(url)
1780 webpage = urllib2.urlopen(request).read()
1781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1785 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1787 self._downloader.trouble(u'ERROR: Unable to extract id field')
1789 yahoo_id = mobj.group(1)
1791 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1793 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1795 yahoo_vid = mobj.group(1)
1797 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1798 return self._real_extract(url, new_video=False)
1800 # Retrieve video webpage to extract further information
1801 request = urllib2.Request(url)
1803 self.report_download_webpage(video_id)
1804 webpage = urllib2.urlopen(request).read()
1805 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1806 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1809 # Extract uploader and title from webpage
1810 self.report_extraction(video_id)
1811 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1813 self._downloader.trouble(u'ERROR: unable to extract video title')
1815 video_title = mobj.group(1).decode('utf-8')
1816 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1818 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1820 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1822 video_uploader = mobj.group(1).decode('utf-8')
1824 # Extract video thumbnail
1825 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1827 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1829 video_thumbnail = mobj.group(1).decode('utf-8')
1831 # Extract video description
1832 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1834 self._downloader.trouble(u'ERROR: unable to extract video description')
1836 video_description = mobj.group(1).decode('utf-8')
1837 if not video_description: video_description = 'No description available.'
1839 # Extract video height and width
1840 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1842 self._downloader.trouble(u'ERROR: unable to extract video height')
1844 yv_video_height = mobj.group(1)
1846 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1848 self._downloader.trouble(u'ERROR: unable to extract video width')
1850 yv_video_width = mobj.group(1)
1852 # Retrieve video playlist to extract media URL
1853 # I'm not completely sure what all these options are, but we
1854 # seem to need most of them, otherwise the server sends a 401.
1855 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1856 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1857 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1858 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1859 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1861 self.report_download_webpage(video_id)
1862 webpage = urllib2.urlopen(request).read()
1863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1864 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1867 # Extract media URL from playlist XML
1868 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1870 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1872 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1873 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1876 # Process video information
1877 self._downloader.process_info({
1878 'id': video_id.decode('utf-8'),
1880 'uploader': video_uploader,
1881 'upload_date': u'NA',
1882 'title': video_title,
1883 'stitle': simple_title,
1884 'ext': video_extension.decode('utf-8'),
1885 'thumbnail': video_thumbnail.decode('utf-8'),
1886 'description': video_description,
1887 'thumbnail': video_thumbnail,
1888 'description': video_description,
1891 except UnavailableVideoError:
1892 self._downloader.trouble(u'\nERROR: unable to download video')
1895 class GenericIE(InfoExtractor):
1896 """Generic last-resort information extractor."""
1898 def __init__(self, downloader=None):
1899 InfoExtractor.__init__(self, downloader)
1905 def report_download_webpage(self, video_id):
1906 """Report webpage download."""
1907 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1908 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1910 def report_extraction(self, video_id):
1911 """Report information extraction."""
1912 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1914 def _real_initialize(self):
1917 def _real_extract(self, url):
1918 # At this point we have a new video
1919 self._downloader.increment_downloads()
1921 video_id = url.split('/')[-1]
1922 request = urllib2.Request(url)
1924 self.report_download_webpage(video_id)
1925 webpage = urllib2.urlopen(request).read()
1926 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1927 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1929 except ValueError, err:
1930 # since this is the last-resort InfoExtractor, if
1931 # this error is thrown, it'll be thrown here
1932 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1935 self.report_extraction(video_id)
1936 # Start with something easy: JW Player in SWFObject
1937 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1939 # Broaden the search a little bit
1940 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1942 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1945 # It's possible that one of the regexes
1946 # matched, but returned an empty group:
1947 if mobj.group(1) is None:
1948 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1951 video_url = urllib.unquote(mobj.group(1))
1952 video_id = os.path.basename(video_url)
1954 # here's a fun little line of code for you:
1955 video_extension = os.path.splitext(video_id)[1][1:]
1956 video_id = os.path.splitext(video_id)[0]
1958 # it's tempting to parse this further, but you would
1959 # have to take into account all the variations like
1960 # Video Title - Site Name
1961 # Site Name | Video Title
1962 # Video Title - Tagline | Site Name
1963 # and so on and so forth; it's just not practical
1964 mobj = re.search(r'<title>(.*)</title>', webpage)
1966 self._downloader.trouble(u'ERROR: unable to extract title')
1968 video_title = mobj.group(1).decode('utf-8')
1969 video_title = sanitize_title(video_title)
1970 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1972 # video uploader is domain name
1973 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1975 self._downloader.trouble(u'ERROR: unable to extract title')
1977 video_uploader = mobj.group(1).decode('utf-8')
1980 # Process video information
1981 self._downloader.process_info({
1982 'id': video_id.decode('utf-8'),
1983 'url': video_url.decode('utf-8'),
1984 'uploader': video_uploader,
1985 'upload_date': u'NA',
1986 'title': video_title,
1987 'stitle': simple_title,
1988 'ext': video_extension.decode('utf-8'),
1992 except UnavailableVideoError, err:
1993 self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch[N|all]:<query>' pseudo-URLs by scraping result
    pages and delegating each found video to the YouTube extractor.

    NOTE(review): several lines are missing from this copy of the file
    (the `def suitable(url)` header owning the orphan `return` below,
    `if mobj is None:` guards, `return`s, the `try:` statements matched
    by the `except` clauses, and the pagination `while` loop that owns
    `pagenum`/`video_ids`) — restore them before shipping.
    """
    # Pseudo-URL scheme: 'ytsearch', 'ytsearchN' or 'ytsearchall'
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap mirroring what YouTube will actually return
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Found videos are handed off to this YouTube extractor
        self._youtube_ie = youtube_ie

        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare 'ytsearch:' means a single (best) result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Slice the matched href and pull the v= value (drop closing quote)
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
            if len(video_ids) == n:
                # Specified n videos reached
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No further pages: flush what we have collected so far
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
2087 class GoogleSearchIE(InfoExtractor):
2088 """Information Extractor for Google Video search queries."""
# Handles "gvsearch<N>:<query>" / "gvsearchall:<query>" pseudo-URLs; the
# structure mirrors the other *SearchIE extractors in this file, delegating
# per-video extraction to a wrapped GoogleIE instance.
# NOTE(review): the listing omits some original lines (numbering gaps).
2089 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2090 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2091 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2092 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2094 _max_google_results = 1000
2096 def __init__(self, google_ie, downloader=None):
2097 InfoExtractor.__init__(self, downloader)
2098 self._google_ie = google_ie
# suitable() (def line elided): True when the pseudo-URL matches gvsearch.
2102 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2104 def report_download_page(self, query, pagenum):
2105 """Report attempt to download playlist page with given number."""
# Decode the byte-string query with the locale encoding for display only.
2106 query = query.decode(preferredencoding())
2107 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2109 def _real_initialize(self):
2110 self._google_ie.initialize()
2112 def _real_extract(self, query):
2113 mobj = re.match(self._VALID_QUERY, query)
# Guard (condition line elided): report queries that do not match the scheme.
2115 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2118 prefix, query = query.split(':')
2120 query = query.encode('utf-8')
# Empty prefix => one result; 'all' => site maximum; otherwise the prefix
# is the requested count, clamped to _max_google_results below.
2122 self._download_n_results(query, 1)
2124 elif prefix == 'all':
2125 self._download_n_results(query, self._max_google_results)
2131 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2133 elif n > self._max_google_results:
2134 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2135 n = self._max_google_results
2136 self._download_n_results(query, n)
2138 except ValueError: # parsing prefix as integer fails
2139 self._download_n_results(query, 1)
2142 def _download_n_results(self, query, n):
2143 """Downloads a specified number of results for a query"""
2146 already_seen = set()
2150 self.report_download_page(query, pagenum)
2151 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2152 request = urllib2.Request(result_url)
2154 page = urllib2.urlopen(request).read()
2155 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2156 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2159 # Extract video identifiers
2160 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike the YouTube search indicator, this pattern captures the id directly.
2161 video_id = mobj.group(1)
2162 if video_id not in already_seen:
2163 video_ids.append(video_id)
2164 already_seen.add(video_id)
2165 if len(video_ids) == n:
2166 # Specified n videos reached
2167 for id in video_ids:
2168 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link => last results page: extract what was collected and stop.
2171 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2172 for id in video_ids:
2173 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2176 pagenum = pagenum + 1
2178 class YahooSearchIE(InfoExtractor):
2179 """Information Extractor for Yahoo! Video search queries."""
# Handles "yvsearch<N>:<query>" / "yvsearchall:<query>" pseudo-URLs; the
# structure mirrors the other *SearchIE extractors in this file, delegating
# per-video extraction to a wrapped YahooIE instance.
# NOTE(review): the listing omits some original lines (numbering gaps).
2180 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2181 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2182 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2183 _MORE_PAGES_INDICATOR = r'\s*Next'
2185 _max_yahoo_results = 1000
2187 def __init__(self, yahoo_ie, downloader=None):
2188 InfoExtractor.__init__(self, downloader)
2189 self._yahoo_ie = yahoo_ie
# suitable() (def line elided): True when the pseudo-URL matches yvsearch.
2193 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2195 def report_download_page(self, query, pagenum):
2196 """Report attempt to download playlist page with given number."""
# Decode the byte-string query with the locale encoding for display only.
2197 query = query.decode(preferredencoding())
2198 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2200 def _real_initialize(self):
2201 self._yahoo_ie.initialize()
2203 def _real_extract(self, query):
2204 mobj = re.match(self._VALID_QUERY, query)
# Guard (condition line elided): report queries that do not match the scheme.
2206 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2209 prefix, query = query.split(':')
2211 query = query.encode('utf-8')
# Empty prefix => one result; 'all' => site maximum; otherwise the prefix
# is the requested count, clamped to _max_yahoo_results below.
2213 self._download_n_results(query, 1)
2215 elif prefix == 'all':
2216 self._download_n_results(query, self._max_yahoo_results)
2222 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2224 elif n > self._max_yahoo_results:
2225 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2226 n = self._max_yahoo_results
2227 self._download_n_results(query, n)
2229 except ValueError: # parsing prefix as integer fails
2230 self._download_n_results(query, 1)
2233 def _download_n_results(self, query, n):
2234 """Downloads a specified number of results for a query"""
2237 already_seen = set()
2241 self.report_download_page(query, pagenum)
2242 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2243 request = urllib2.Request(result_url)
2245 page = urllib2.urlopen(request).read()
2246 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2247 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2250 # Extract video identifiers
2251 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The indicator captures the "owner/video" id pair directly.
2252 video_id = mobj.group(1)
2253 if video_id not in already_seen:
2254 video_ids.append(video_id)
2255 already_seen.add(video_id)
2256 if len(video_ids) == n:
2257 # Specified n videos reached
2258 for id in video_ids:
2259 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link => last results page: extract what was collected and stop.
2262 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2263 for id in video_ids:
2264 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2267 pagenum = pagenum + 1
2269 class YoutubePlaylistIE(InfoExtractor):
2270 """Information Extractor for YouTube playlists."""
# Walks a playlist/artist/user page-by-page, collects video ids, then hands
# each id to a wrapped YoutubeIE. Honors --playlist-start/--playlist-end.
# NOTE(review): the listing omits some original lines (numbering gaps), so
# loop headers and some guard conditions are not visible here.
2272 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2273 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2274 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2275 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2278 def __init__(self, youtube_ie, downloader=None):
2279 InfoExtractor.__init__(self, downloader)
2280 self._youtube_ie = youtube_ie
# suitable() (def line elided): True when the URL matches _VALID_URL.
2284 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2286 def report_download_page(self, playlist_id, pagenum):
2287 """Report attempt to download playlist page with given number."""
2288 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2290 def _real_initialize(self):
2291 self._youtube_ie.initialize()
2293 def _real_extract(self, url):
2294 # Extract playlist id
2295 mobj = re.match(self._VALID_URL, url)
# Guard (condition line elided): report URLs that do not match.
2297 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: group(3) carries a specific video id inside the
# playlist URL, so extract just that one video.
2301 if mobj.group(3) is not None:
2302 self._youtube_ie.extract(mobj.group(3))
2305 # Download playlist pages
2306 # prefix is 'p' as default for playlists but there are other types that need extra care
2307 playlist_prefix = mobj.group(1)
2308 if playlist_prefix == 'a':
2309 playlist_access = 'artist'
2311 playlist_prefix = 'p'
2312 playlist_access = 'view_play_list'
2313 playlist_id = mobj.group(2)
2318 self.report_download_page(playlist_id, pagenum)
2319 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2321 page = urllib2.urlopen(request).read()
2322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2326 # Extract video identifiers
2328 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Deduplicate within this page while preserving order of appearance.
2329 if mobj.group(1) not in ids_in_page:
2330 ids_in_page.append(mobj.group(1))
2331 video_ids.extend(ids_in_page)
# No "Next" link => last page of the playlist.
2333 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2335 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end (1-based option, 0-based slice).
2337 playliststart = self._downloader.params.get('playliststart', 1) - 1
2338 playlistend = self._downloader.params.get('playlistend', -1)
2339 video_ids = video_ids[playliststart:playlistend]
2341 for id in video_ids:
2342 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2345 class YoutubeUserIE(InfoExtractor):
2346 """Information Extractor for YouTube users."""
# Enumerates all uploads of a user via the GData API in pages of
# _GDATA_PAGE_SIZE ids, then hands each id to a wrapped YoutubeIE.
# NOTE(review): the listing omits some original lines (numbering gaps).
2348 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2349 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2350 _GDATA_PAGE_SIZE = 50
2351 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2352 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2355 def __init__(self, youtube_ie, downloader=None):
2356 InfoExtractor.__init__(self, downloader)
2357 self._youtube_ie = youtube_ie
# suitable() (def line elided): True for user URLs or "ytuser:" pseudo-URLs.
2361 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2363 def report_download_page(self, username, start_index):
2364 """Report attempt to download user page."""
2365 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2366 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2368 def _real_initialize(self):
2369 self._youtube_ie.initialize()
2371 def _real_extract(self, url):
2373 mobj = re.match(self._VALID_URL, url)
# Guard (condition line elided): report URLs that do not match.
2375 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2378 username = mobj.group(1)
2380 # Download video ids using YouTube Data API. Result size per
2381 # query is limited (currently to 50 videos) so we need to query
2382 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the "+ 1".
2389 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2390 self.report_download_page(username, start_index)
2392 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2395 page = urllib2.urlopen(request).read()
2396 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2397 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2400 # Extract video identifiers
2403 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Deduplicate within this page while preserving order of appearance.
2404 if mobj.group(1) not in ids_in_page:
2405 ids_in_page.append(mobj.group(1))
2407 video_ids.extend(ids_in_page)
2409 # A little optimization - if current page is not
2410 # "full", ie. does not contain PAGE_SIZE video ids then
2411 # we can assume that this page is the last one - there
2412 # are no more ids on further pages - no need to query
2415 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start/--playlist-end (1-based option, 0-based slice);
# playlistend == -1 means "no end limit".
2420 all_ids_count = len(video_ids)
2421 playliststart = self._downloader.params.get('playliststart', 1) - 1
2422 playlistend = self._downloader.params.get('playlistend', -1)
2424 if playlistend == -1:
2425 video_ids = video_ids[playliststart:]
2427 video_ids = video_ids[playliststart:playlistend]
2429 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2430 (username, all_ids_count, len(video_ids)))
2432 for video_id in video_ids:
2433 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2436 class DepositFilesIE(InfoExtractor):
2437 """Information extractor for depositfiles.com"""
# Self-contained extractor (no wrapped IE): simulates pressing the
# "Free download" button and scrapes the resulting fileshare URL.
# NOTE(review): the listing omits some original lines (numbering gaps).
2439 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2441 def __init__(self, downloader=None):
2442 InfoExtractor.__init__(self, downloader)
# suitable() (def line elided): True when the URL matches _VALID_URL.
2446 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2448 def report_download_webpage(self, file_id):
2449 """Report webpage download."""
2450 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2452 def report_extraction(self, file_id):
2453 """Report information extraction."""
2454 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2456 def _real_initialize(self):
2459 def _real_extract(self, url):
2460 # At this point we have a new file
2461 self._downloader.increment_downloads()
2463 file_id = url.split('/')[-1]
2464 # Rebuild url in english locale
2465 url = 'http://depositfiles.com/en/files/' + file_id
2467 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the "Free download" form submission.
2468 free_download_indication = { 'gateway_result' : '1' }
2469 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2471 self.report_download_webpage(file_id)
2472 webpage = urllib2.urlopen(request).read()
2473 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2477 # Search for the real file URL
2478 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2479 if (mobj is None) or (mobj.group(1) is None):
2480 # Try to figure out reason of the error.
2481 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2482 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's restriction message for display.
2483 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2484 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2486 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2489 file_url = mobj.group(1)
2490 file_extension = os.path.splitext(file_url)[1][1:]
2492 # Search for file title
2493 mobj = re.search(r'<b title="(.*?)">', webpage)
# Guard (condition line elided): report when the title cannot be found.
2495 self._downloader.trouble(u'ERROR: unable to extract title')
2497 file_title = mobj.group(1).decode('utf-8')
2500 # Process file information
2501 self._downloader.process_info({
2502 'id': file_id.decode('utf-8'),
2503 'url': file_url.decode('utf-8'),
2505 'upload_date': u'NA',
2506 'title': file_title,
2507 'stitle': file_title,
2508 'ext': file_extension.decode('utf-8'),
2512 except UnavailableVideoError, err:
2513 self._downloader.trouble(u'ERROR: unable to download file')
2515 class FacebookIE(InfoExtractor):
2516 """Information Extractor for Facebook"""
# Logs in (credentials from options or .netrc), downloads the video page,
# and scrapes metadata plus per-quality URLs from inline JavaScript.
# NOTE(review): the listing omits some original lines (numbering gaps), so
# some guard conditions, returns and dict entries are not visible here.
2518 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2519 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2520 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used both for scraping URLs and for format selection.
2521 _available_formats = ['highqual', 'lowqual']
2522 _video_extensions = {
2527 def __init__(self, downloader=None):
2528 InfoExtractor.__init__(self, downloader)
# suitable() (def line elided): True when the URL matches _VALID_URL.
2532 return (re.match(FacebookIE._VALID_URL, url) is not None)
2534 def _reporter(self, message):
2535 """Add header and report message."""
2536 self._downloader.to_screen(u'[facebook] %s' % message)
2538 def report_login(self):
2539 """Report attempt to log in."""
2540 self._reporter(u'Logging in')
2542 def report_video_webpage_download(self, video_id):
2543 """Report attempt to download video webpage."""
2544 self._reporter(u'%s: Downloading video webpage' % video_id)
2546 def report_information_extraction(self, video_id):
2547 """Report attempt to extract video information."""
2548 self._reporter(u'%s: Extracting video information' % video_id)
2550 def _parse_page(self, video_webpage):
2551 """Extract video information from page"""
# Map of info-dict key -> regex that scrapes it from the page markup.
2553 data = {'title': r'class="video_title datawrap">(.*?)</',
2554 'description': r'<div class="datawrap">(.*?)</div>',
2555 'owner': r'\("video_owner_name", "(.*?)"\)',
2556 'upload_date': r'data-date="(.*?)"',
2557 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2560 for piece in data.keys():
2561 mobj = re.search(data[piece], video_webpage)
2562 if mobj is not None:
# Values are JS-escaped Unicode inside the page; unescape then unquote.
2563 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per available quality level.
2567 for fmt in self._available_formats:
2568 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2569 if mobj is not None:
2570 # URL is in a Javascript segment inside an escaped Unicode format within
2571 # the generally utf-8 page
2572 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2573 video_info['video_urls'] = video_urls
2577 def _real_initialize(self):
# Without a downloader there are no options to read; nothing to do.
2578 if self._downloader is None:
2583 downloader_params = self._downloader.params
2585 # Attempt to use provided username and password or .netrc data
2586 if downloader_params.get('username', None) is not None:
2587 useremail = downloader_params['username']
2588 password = downloader_params['password']
2589 elif downloader_params.get('usenetrc', False):
2591 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2592 if info is not None:
2596 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2597 except (IOError, netrc.NetrcParseError), err:
2598 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login (lines elided here).
2601 if useremail is None:
2610 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2613 login_results = urllib2.urlopen(request).read()
# A login form in the response means the login did not succeed.
2614 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2615 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2621 def _real_extract(self, url):
2622 mobj = re.match(self._VALID_URL, url)
# Guard (condition line elided): report URLs that do not match.
2624 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2626 video_id = mobj.group('ID')
2629 self.report_video_webpage_download(video_id)
2630 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2632 page = urllib2.urlopen(request)
2633 video_webpage = page.read()
2634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2635 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2638 # Start extracting information
2639 self.report_information_extraction(video_id)
2641 # Extract information
2642 video_info = self._parse_page(video_webpage)
2645 if 'owner' not in video_info:
2646 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2648 video_uploader = video_info['owner']
2651 if 'title' not in video_info:
2652 self._downloader.trouble(u'ERROR: unable to extract video title')
2654 video_title = video_info['title']
2655 video_title = video_title.decode('utf-8')
2656 video_title = sanitize_title(video_title)
# Build a filesystem-safe "simple title" from the allowed character set.
2659 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2660 simple_title = simple_title.strip(ur'_')
# Thumbnail is optional: warn and fall back to an empty string.
2663 if 'thumbnail' not in video_info:
2664 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2665 video_thumbnail = ''
2667 video_thumbnail = video_info['thumbnail']
# Upload date is optional; parse the RFC-2822-style date into YYYYMMDD.
2671 if 'upload_date' in video_info:
2672 upload_time = video_info['upload_date']
2673 timetuple = email.utils.parsedate_tz(upload_time)
2674 if timetuple is not None:
2676 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2681 video_description = video_info.get('description', 'No description available.')
2683 url_map = video_info['video_urls']
2684 if len(url_map.keys()) > 0:
2685 # Decide which formats to download
2686 req_format = self._downloader.params.get('format', None)
2687 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality: restrict the candidate list to formats at or below the cap.
2689 if format_limit is not None and format_limit in self._available_formats:
2690 format_list = self._available_formats[self._available_formats.index(format_limit):]
2692 format_list = self._available_formats
2693 existing_formats = [x for x in format_list if x in url_map]
2694 if len(existing_formats) == 0:
2695 self._downloader.trouble(u'ERROR: no known formats available for video')
2697 if req_format is None:
2698 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2699 elif req_format == '-1':
2700 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2703 if req_format not in url_map:
2704 self._downloader.trouble(u'ERROR: requested format not available')
2706 video_url_list = [(req_format, url_map[req_format])] # Specific format
2708 for format_param, video_real_url in video_url_list:
2710 # At this point we have a new video
2711 self._downloader.increment_downloads()
2714 video_extension = self._video_extensions.get(format_param, 'mp4')
2716 # Find the video URL in fmt_url_map or conn paramters
2718 # Process video information
2719 self._downloader.process_info({
2720 'id': video_id.decode('utf-8'),
2721 'url': video_real_url.decode('utf-8'),
2722 'uploader': video_uploader.decode('utf-8'),
2723 'upload_date': upload_date,
2724 'title': video_title,
2725 'stitle': simple_title,
2726 'ext': video_extension.decode('utf-8'),
2727 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2728 'thumbnail': video_thumbnail.decode('utf-8'),
2729 'description': video_description.decode('utf-8'),
2732 except UnavailableVideoError, err:
2733 self._downloader.trouble(u'\nERROR: unable to download video')
2735 class BlipTVIE(InfoExtractor):
2736 """Information extractor for blip.tv"""
# Fetches the JSON ("skin=json") variant of a blip.tv URL and builds the
# info dict directly from the returned metadata.
# NOTE(review): the listing omits some original lines (numbering gaps).
2738 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
# Regex to pull the filename extension off the media URL.
2739 _URL_EXT = r'^.*\.([a-z0-9]+)$'
# suitable() (def line elided): True when the URL matches _VALID_URL.
2743 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2745 def report_extraction(self, file_id):
2746 """Report information extraction."""
2747 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2749 def _simplify_title(self, title):
# Filesystem-safe title: replace disallowed runs with '_' and trim.
2750 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2751 res = res.strip(ur'_')
2754 def _real_extract(self, url):
2755 mobj = re.match(self._VALID_URL, url)
# Guard (condition line elided): report URLs that do not match.
2757 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for machine-readable JSON instead of the HTML page.
2760 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2761 request = urllib2.Request(json_url)
2762 self.report_extraction(mobj.group(1))
2764 json_code = urllib2.urlopen(request).read()
2765 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2769 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' object, some do not.
2770 data = json_data['Post'] if 'Post' in json_data else json_data
# blip.tv datestamps look like "MM-DD-YY HH:MM(AM|PM)"; normalize to YYYYMMDD.
2772 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2773 video_url = data['media']['url']
2774 umobj = re.match(self._URL_EXT, video_url)
# Guard (condition line elided): fail when the URL has no recognizable extension.
2776 raise ValueError('Can not determine filename extension')
2777 ext = umobj.group(1)
2779 self._downloader.increment_downloads()
2782 'id': data['item_id'],
2784 'uploader': data['display_name'],
2785 'upload_date': upload_date,
2786 'title': data['title'],
2787 'stitle': self._simplify_title(data['title']),
2789 'format': data['media']['mimeType'],
2790 'thumbnail': data['thumbnailUrl'],
2791 'description': data['description'],
2792 'player_url': data['embedUrl']
2794 except (ValueError,KeyError), err:
2795 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2799 self._downloader.process_info(info)
2800 except UnavailableVideoError, err:
2801 self._downloader.trouble(u'\nERROR: unable to download video')
2804 class PostProcessor(object):
2805 """Post Processor class.
2807 PostProcessor objects can be added to downloaders with their
2808 add_post_processor() method. When the downloader has finished a
2809 successful download, it will take its internal chain of PostProcessors
2810 and start calling the run() method on each one of them, first with
2811 an initial argument and then with the returned value of the previous
2814 The chain will be stopped if one of them ever returns None or the end
2815 of the chain is reached.
2817 PostProcessor objects follow a "mutual registration" process similar
2818 to InfoExtractor objects.
# Base class: subclasses override run(); this default implementation is a
# no-op pass-through so an unconfigured PP never breaks the chain.
2823 def __init__(self, downloader=None):
2824 self._downloader = downloader
2826 def set_downloader(self, downloader):
2827 """Sets the downloader for this PP."""
2828 self._downloader = downloader
2830 def run(self, information):
2831 """Run the PostProcessor.
2833 The "information" argument is a dictionary like the ones
2834 composed by InfoExtractors. The only difference is that this
2835 one has an extra field called "filepath" that points to the
2838 When this method returns None, the postprocessing chain is
2839 stopped. However, this method may return an information
2840 dictionary that will be passed to the next postprocessing
2841 object in the chain. It can be the one it received after
2842 changing some fields.
2844 In addition, this method may raise a PostProcessingError
2845 exception that will be taken into account by the downloader
2848 return information # by default, do nothing
2850 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that extracts the audio track of a downloaded video into a
# standalone audio file using the external ffprobe/ffmpeg binaries.
# NOTE(review): the listing omits some original lines (numbering gaps), so
# several returns and branch headers are not visible here.
2852 def __init__(self, downloader=None, preferredcodec=None):
2853 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec losslessly when it is aac/mp3.
2854 if preferredcodec is None:
2855 preferredcodec = 'best'
2856 self._preferredcodec = preferredcodec
# Probes `path` with ffprobe and parses its stream dump for the audio
# codec name. Decorator line (presumably @staticmethod) is elided here.
2859 def get_audio_codec(path):
2861 cmd = ['ffprobe', '-show_streams', '--', path]
2862 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2863 output = handle.communicate()[0]
2864 if handle.wait() != 0:
2866 except (IOError, OSError):
# codec_name= appears per stream; remember the last one seen before the
# matching codec_type=audio line confirms it belongs to an audio stream.
2869 for line in output.split('\n'):
2870 if line.startswith('codec_name='):
2871 audio_codec = line.split('=')[1].strip()
2872 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Runs ffmpeg to transcode/remux `path` into `out_path` with the given
# audio codec and extra options. Decorator line elided here.
2877 def run_ffmpeg(path, out_path, codec, more_opts):
2879 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2880 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2882 except (IOError, OSError):
2885 def run(self, information):
2886 path = information['filepath']
2888 filecodec = self.get_audio_codec(path)
2889 if filecodec is None:
2890 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2894 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2895 if filecodec == 'aac' or filecodec == 'mp3':
2896 # Lossless if possible
2898 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
2899 if filecodec == 'aac':
2900 more_opts = ['-f', 'adts']
2903 acodec = 'libmp3lame'
2905 more_opts = ['-ab', '128k']
2907 # We convert the audio (lossy)
2908 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2909 extension = self._preferredcodec
2910 more_opts = ['-ab', '128k']
2911 if self._preferredcodec == 'aac':
2912 more_opts += ['-f', 'adts']
# Output file: same basename as the video, new audio extension.
2914 (prefix, ext) = os.path.splitext(path)
2915 new_path = prefix + '.' + extension
2916 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2917 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2920 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file is best-effort only.
2925 except (IOError, OSError):
2926 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the chain at the new audio file for any further post-processing.
2929 information['filepath'] = new_path
2932 ### MAIN PROGRAM ###
2933 if __name__ == '__main__':
2935 # Modules needed only when running the main program
2939 # Function to update the program file with the latest version from the repository.
2940 def update_self(downloader, filename):
2941 # Note: downloader only used for options
2942 if not os.access(filename, os.W_OK):
2943 sys.exit('ERROR: no write permissions on %s' % filename)
2945 downloader.to_screen('Updating to latest stable version...')
2947 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2948 latest_version = urllib.urlopen(latest_url).read().strip()
2949 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2950 newcontent = urllib.urlopen(prog_url).read()
2951 except (IOError, OSError), err:
2952 sys.exit('ERROR: unable to download latest version')
2954 stream = open(filename, 'w')
2955 stream.write(newcontent)
2957 except (IOError, OSError), err:
2958 sys.exit('ERROR: unable to overwrite current version')
2959 downloader.to_screen('Updated to version %s' % latest_version)
2961 # Parse command line
2962 parser = optparse.OptionParser(
2963 usage='Usage: %prog [options] url...',
2964 version='2011.07.09-phihag',
2965 conflict_handler='resolve',
2968 parser.add_option('-h', '--help',
2969 action='help', help='print this help text and exit')
2970 parser.add_option('-v', '--version',
2971 action='version', help='print program version and exit')
2972 parser.add_option('-U', '--update',
2973 action='store_true', dest='update_self', help='update this program to latest stable version')
2974 parser.add_option('-i', '--ignore-errors',
2975 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2976 parser.add_option('-r', '--rate-limit',
2977 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2978 parser.add_option('-R', '--retries',
2979 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2980 parser.add_option('--playlist-start',
2981 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2982 parser.add_option('--playlist-end',
2983 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
# ---------------------------------------------------------------------------
# Command-line interface definition (Python 2 `optparse`).
# NOTE(review): this listing embeds original line numbers at the start of each
# line and omits some interleaved lines (e.g. the creation of `parser` itself
# is above this view); comments describe only the visible statements.
# ---------------------------------------------------------------------------
# General option: print the User-Agent string sent with HTTP requests and exit.
2984 parser.add_option('--dump-user-agent',
2985 action='store_true', dest='dump_user_agent',
2986 help='display the current browser identification', default=False)
# Authentication: explicit username/password, or credentials from ~/.netrc.
2988 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2989 authentication.add_option('-u', '--username',
2990 dest='username', metavar='USERNAME', help='account username')
2991 authentication.add_option('-p', '--password',
2992 dest='password', metavar='PASSWORD', help='account password')
2993 authentication.add_option('-n', '--netrc',
2994 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2995 parser.add_option_group(authentication)
# Video format selection: a specific format code, all formats ('-1' sentinel,
# stored into the same dest via store_const), or an upper quality bound.
2997 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2998 video_format.add_option('-f', '--format',
2999 action='store', dest='format', metavar='FORMAT', help='video format code')
3000 video_format.add_option('--all-formats',
3001 action='store_const', dest='format', help='download all available video formats', const='-1')
3002 video_format.add_option('--max-quality',
3003 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3004 parser.add_option_group(video_format)
# Verbosity / simulation: the --get-* flags each imply quiet+simulate later
# (see the FileDownloader config below) and print one piece of metadata.
3006 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3007 verbosity.add_option('-q', '--quiet',
3008 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3009 verbosity.add_option('-s', '--simulate',
3010 action='store_true', dest='simulate', help='do not download video', default=False)
3011 verbosity.add_option('-g', '--get-url',
3012 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3013 verbosity.add_option('-e', '--get-title',
3014 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3015 verbosity.add_option('--get-thumbnail',
3016 action='store_true', dest='getthumbnail',
3017 help='simulate, quiet but print thumbnail URL', default=False)
3018 verbosity.add_option('--get-description',
3019 action='store_true', dest='getdescription',
3020 help='simulate, quiet but print video description', default=False)
3021 verbosity.add_option('--get-filename',
3022 action='store_true', dest='getfilename',
3023 help='simulate, quiet but print output filename', default=False)
3024 verbosity.add_option('--no-progress',
3025 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3026 verbosity.add_option('--console-title',
3027 action='store_true', dest='consoletitle',
3028 help='display progress in console titlebar', default=False)
3029 parser.add_option_group(verbosity)
# Filesystem: output naming (title/literal/auto-number/template are mutually
# constrained — validated after parse_args), batch input, overwrite/resume
# policy, cookies, .part files, mtime, and metadata side files.
3031 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3032 filesystem.add_option('-t', '--title',
3033 action='store_true', dest='usetitle', help='use title in file name', default=False)
3034 filesystem.add_option('-l', '--literal',
3035 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3036 filesystem.add_option('-A', '--auto-number',
3037 action='store_true', dest='autonumber',
3038 help='number downloaded files starting from 00000', default=False)
3039 filesystem.add_option('-o', '--output',
3040 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3041 filesystem.add_option('-a', '--batch-file',
3042 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3043 filesystem.add_option('-w', '--no-overwrites',
3044 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3045 filesystem.add_option('-c', '--continue',
3046 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3047 filesystem.add_option('--cookies',
3048 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3049 filesystem.add_option('--no-part',
3050 action='store_true', dest='nopart', help='do not use .part files', default=False)
# --no-mtime is store_false on updatetime (default True): the flag disables
# copying the server's Last-Modified into the file's modification time.
3051 filesystem.add_option('--no-mtime',
3052 action='store_false', dest='updatetime',
3053 help='do not use the Last-modified header to set the file modification time', default=True)
3054 filesystem.add_option('--write-description',
3055 action='store_true', dest='writedescription',
3056 help='write video description to a .description file', default=False)
3057 filesystem.add_option('--write-info-json',
3058 action='store_true', dest='writeinfojson',
3059 help='write video metadata to a .info.json file', default=False)
3060 parser.add_option_group(filesystem)
# Post-processing: optional audio extraction via external ffmpeg/ffprobe.
3062 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3063 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3064 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3065 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3066 help='"best", "aac" or "mp3"; best by default')
3067 parser.add_option_group(postproc)
# Parse argv: `opts` holds option values, `args` the positional URLs.
3069 (opts, args) = parser.parse_args()
3071 # Open appropriate CookieJar
# No --cookies file: use an in-memory jar that is discarded on exit.
3072 if opts.cookiefile is None:
3073 jar = cookielib.CookieJar()
# With --cookies: use a Mozilla-format jar bound to the file so it can be
# saved back later.  NOTE(review): the `else:`/`try:` lines between 3073 and
# 3076, and the jar.load() call before the `except`, are missing from this
# listing — presumably an existing readable cookie file is loaded here.
3076 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3077 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3079 except (IOError, OSError), err:
3080 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string and (presumably, in the omitted
# lines) exit before doing any work.
3083 if opts.dump_user_agent:
3084 print std_headers['User-Agent']
3087 # General configuration
# Install a global urllib2 opener so every later urlopen() goes through the
# environment proxy settings, the cookie jar above, and the project's
# YoutubeDLHandler (defined elsewhere in this file).
3088 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3089 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3090 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3092 # Batch file verification
# Collect URLs from --batch-file (if given) and prepend them to the
# positional arguments.  NOTE(review): the line choosing stdin for '-' and
# the `try:`/`except` wrapping the file read are missing from this listing.
3094 if opts.batchfile is not None:
3096 if opts.batchfile == '-':
3099 batchfd = open(opts.batchfile, 'r')
3100 batchurls = batchfd.readlines()
3101 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and comment lines starting with '#', '/' or ';'.
3102 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3104 sys.exit(u'ERROR: batch file could not be read')
3105 all_urls = batchurls + args
3107 # Conflicting, missing and erroneous options
# Mutually exclusive credential sources.
3108 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3109 parser.error(u'using .netrc conflicts with giving username/password')
3110 if opts.password is not None and opts.username is None:
3111 parser.error(u'account username missing')
# A custom -o template overrides all automatic naming flags.
3112 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3113 parser.error(u'using output template conflicts with using title, literal title or auto number')
3114 if opts.usetitle and opts.useliteral:
3115 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (keeps it out of argv).
3116 if opts.username is not None and opts.password is None:
3117 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize --rate-limit (e.g. '50k') to a byte count via the project helper.
3118 if opts.ratelimit is not None:
3119 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3120 if numeric_limit is None:
3121 parser.error(u'invalid rate limit specified')
3122 opts.ratelimit = numeric_limit
# Integer coercion of retries / playlist bounds uses Python 2 long();
# NOTE(review): the `try:` lines paired with these `except` clauses are
# missing from this listing.
3123 if opts.retries is not None:
3125 opts.retries = long(opts.retries)
3126 except (TypeError, ValueError), err:
3127 parser.error(u'invalid retry count specified')
3129 opts.playliststart = long(opts.playliststart)
3130 if opts.playliststart <= 0:
3132 except (TypeError, ValueError), err:
3133 parser.error(u'invalid playlist start number specified')
# -1 means "to the end"; otherwise the end must be positive and >= start.
3135 opts.playlistend = long(opts.playlistend)
3136 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3138 except (TypeError, ValueError), err:
3139 parser.error(u'invalid playlist end number specified')
# --audio-format is only meaningful (and only validated) with --extract-audio.
3140 if opts.extractaudio:
3141 if opts.audioformat not in ['best', 'aac', 'mp3']:
3142 parser.error(u'invalid audio format specified')
3144 # Information extractors
# Instantiate one extractor per supported site.  Several take another IE in
# their constructor (e.g. the playlist/user/search IEs wrap youtube_ie) —
# presumably so they can delegate individual video extraction to it.
3145 youtube_ie = YoutubeIE()
3146 metacafe_ie = MetacafeIE(youtube_ie)
3147 dailymotion_ie = DailymotionIE()
3148 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3149 youtube_user_ie = YoutubeUserIE(youtube_ie)
3150 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3151 google_ie = GoogleIE()
3152 google_search_ie = GoogleSearchIE(google_ie)
3153 photobucket_ie = PhotobucketIE()
3154 yahoo_ie = YahooIE()
3155 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3156 deposit_files_ie = DepositFilesIE()
3157 facebook_ie = FacebookIE()
3158 bliptv_ie = BlipTVIE()
# GenericIE is the catch-all fallback, registered last (see below).
3159 generic_ie = GenericIE()
# Build the FileDownloader with all resolved options.
# NOTE(review): the closing `})` of this dict literal (original line ~3199)
# is missing from this listing.
3162 fd = FileDownloader({
3163 'usenetrc': opts.usenetrc,
3164 'username': opts.username,
3165 'password': opts.password,
# Any --get-* flag forces quiet mode and simulation: only the requested
# metadata is printed, nothing is downloaded.
3166 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3167 'forceurl': opts.geturl,
3168 'forcetitle': opts.gettitle,
3169 'forcethumbnail': opts.getthumbnail,
3170 'forcedescription': opts.getdescription,
3171 'forcefilename': opts.getfilename,
3172 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3173 'format': opts.format,
3174 'format_limit': opts.format_limit,
# Output template: a short-circuiting `or` chain picks the first applicable
# template — explicit -o (decoded from the locale's preferred encoding),
# then the --all-formats ('-1') variants, then title/literal/auto-number
# combinations, falling back to plain '%(id)s.%(ext)s'.
3175 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3176 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3177 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3178 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3179 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3180 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3181 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3182 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3183 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3184 or u'%(id)s.%(ext)s'),
3185 'ignoreerrors': opts.ignoreerrors,
3186 'ratelimit': opts.ratelimit,
3187 'nooverwrites': opts.nooverwrites,
3188 'retries': opts.retries,
3189 'continuedl': opts.continue_dl,
3190 'noprogress': opts.noprogress,
3191 'playliststart': opts.playliststart,
3192 'playlistend': opts.playlistend,
# When output goes to stdout ('-o -'), logging must go to stderr instead.
3193 'logtostderr': opts.outtmpl == '-',
3194 'consoletitle': opts.consoletitle,
3195 'nopart': opts.nopart,
3196 'updatetime': opts.updatetime,
3197 'writedescription': opts.writedescription,
3198 'writeinfojson': opts.writeinfojson,
# Register extractors.  Order matters: URL matching tries them in
# registration order, so the more specific YouTube search/playlist/user IEs
# come before the plain video IE.
3200 fd.add_info_extractor(youtube_search_ie)
3201 fd.add_info_extractor(youtube_pl_ie)
3202 fd.add_info_extractor(youtube_user_ie)
3203 fd.add_info_extractor(metacafe_ie)
3204 fd.add_info_extractor(dailymotion_ie)
3205 fd.add_info_extractor(youtube_ie)
3206 fd.add_info_extractor(google_ie)
3207 fd.add_info_extractor(google_search_ie)
3208 fd.add_info_extractor(photobucket_ie)
3209 fd.add_info_extractor(yahoo_ie)
3210 fd.add_info_extractor(yahoo_search_ie)
3211 fd.add_info_extractor(deposit_files_ie)
3212 fd.add_info_extractor(facebook_ie)
3213 fd.add_info_extractor(bliptv_ie)
3215 # This must come last since it's the
3216 # fallback if none of the others work
3217 fd.add_info_extractor(generic_ie)
# Attach the ffmpeg-based audio-extraction post-processor when requested;
# the allowed codec values were validated above.
3220 if opts.extractaudio:
3221 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update: replace the running script with the latest release, then fall
# through (URLs are optional in that case, see below).
3224 if opts.update_self:
3225 update_self(fd, sys.argv[0])
# At least one URL is required unless we only self-updated.
3228 if len(all_urls) < 1:
3229 if not opts.update_self:
3230 parser.error(u'you must provide at least one URL')
# Run the downloads; retcode is presumably used as the process exit status
# in lines omitted from this listing.
3233 retcode = fd.download(all_urls)
3235 # Dump cookie jar if requested
# NOTE(review): the `try:` and the jar.save() call between 3236 and 3239 are
# missing from this listing.
3236 if opts.cookiefile is not None:
3239 except (IOError, OSError), err:
3240 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling for the enclosing `try:` (which begins above this
# view).  NOTE(review): the DownloadError handler body (line ~3245) is
# missing from this listing.
3244 except DownloadError:
3246 except SameFileError:
3247 sys.exit(u'ERROR: fixed output name but more than one file to download')
3248 except KeyboardInterrupt:
3249 sys.exit(u'\nERROR: Interrupted by user')