2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
38 except ImportError: # Python 2.4
41 import cStringIO as StringIO
45 # parse_qs was moved from the cgi module to the urlparse module recently.
47 from urlparse import parse_qs
49 from cgi import parse_qs
53 except ImportError: # Python < 2.6
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
64 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
68 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
def raiseError(msg, i):
	"""Raise a ValueError describing a parse failure at offset i of s."""
	raise ValueError('%s at position %s of %s: %s' % (msg, i, repr(s), repr(s[i:])))
76 def skipSpace(i, expectMore=True):
77 while i < len(s) and s[i] in ' \t\r\n':
81 raiseError('Premature end', i)
83 def decodeEscape(match):
99 return unichr(int(esc[1:5], 16))
100 if len(esc) == 5+6 and esc[5:7] == '\\u':
101 hi = int(esc[1:5], 16)
102 low = int(esc[7:11], 16)
103 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
104 raise ValueError('Unknown escape ' + str(esc))
111 while s[e-bslashes-1] == '\\':
113 if bslashes % 2 == 1:
117 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
118 stri = rexp.sub(decodeEscape, s[i:e])
124 if s[i] == '}': # Empty dictionary
128 raiseError('Expected a string object key', i)
129 i,key = parseString(i)
131 if i >= len(s) or s[i] != ':':
132 raiseError('Expected a colon', i)
139 raiseError('Expected comma or closing curly brace', i)
144 if s[i] == ']': # Empty array
149 i = skipSpace(i) # Raise exception if premature end
153 raiseError('Expected a comma or closing bracket', i)
155 def parseDiscrete(i):
156 for k,v in {'true': True, 'false': False, 'null': None}.items():
157 if s.startswith(k, i):
159 raiseError('Not a boolean (or null)', i)
161 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
163 raiseError('Not a number', i)
165 if '.' in nums or 'e' in nums or 'E' in nums:
166 return (i+len(nums), float(nums))
167 return (i+len(nums), int(nums))
168 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
171 i,res = CHARMAP.get(s[i], parseNumber)(i)
172 i = skipSpace(i, False)
176 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
179 def preferredencoding():
180 """Get preferred encoding.
182 Returns the best encoding scheme for the system, based on
183 locale.getpreferredencoding() and some further tweaks.
185 def yield_preferredencoding():
187 pref = locale.getpreferredencoding()
193 return yield_preferredencoding().next()
195 def htmlentity_transform(matchobj):
196 """Transforms an HTML entity to a Unicode character.
198 This function receives a match object and is intended to be used with
199 the re.sub() function.
201 entity = matchobj.group(1)
203 # Known non-numeric HTML entity
204 if entity in htmlentitydefs.name2codepoint:
205 return unichr(htmlentitydefs.name2codepoint[entity])
208 mobj = re.match(ur'(?u)#(x?\d+)', entity)
210 numstr = mobj.group(1)
211 if numstr.startswith(u'x'):
213 numstr = u'0%s' % numstr
216 return unichr(long(numstr, base))
218 # Unknown entity in name, return its literal representation
219 return (u'&%s;' % entity)
221 def sanitize_title(utitle):
222 """Sanitizes a video title so it could be used as part of a filename."""
223 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
224 return utitle.replace(unicode(os.sep), u'%')
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout, filename)
242 stream = open(filename, open_mode)
243 return (stream, filename)
244 except (IOError, OSError), err:
245 # In case of error, try to remove win32 forbidden chars
246 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
248 # An exception here should be caught in the caller
249 stream = open(filename, open_mode)
250 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None when the string cannot be parsed.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	# NOTE(review): the fallback assignment (presumably timestamp = None)
	# and the final return statement are elided in this excerpt -- confirm
	# against the full file.
260 class DownloadError(Exception):
261 """Download Error exception.
263 This exception may be thrown by FileDownloader objects if they are not
264 configured to continue on errors. They will contain the appropriate
269 class SameFileError(Exception):
270 """Same File exception.
272 This exception will be thrown by FileDownloader objects if they detect
273 multiple files would have to be downloaded to the same file on disk.
277 class PostProcessingError(Exception):
278 """Post Processing exception.
280 This exception may be raised by PostProcessor's .run() method to
281 indicate an error in the postprocessing task.
285 class UnavailableVideoError(Exception):
286 """Unavailable Format exception.
288 This exception will be thrown when a video is requested
289 in a format that is not available for that video.
293 class ContentTooShortError(Exception):
294 """Content Too Short exception.
296 This exception may be raised by FileDownloader objects when a file they
297 download is too small for what the server announced first, indicating
298 the connection was probably interrupted.
def __init__(self, downloaded, expected):
	"""Store the actual and announced byte counts.

	downloaded -- number of bytes actually received
	expected   -- number of bytes announced by the server
	"""
	# Initialize the Exception base so str(err)/repr(err) carry the
	# details instead of being empty (the original skipped this call).
	# The .downloaded/.expected attributes callers read are preserved.
	Exception.__init__(self, 'Downloaded %s bytes, expected %s' % (downloaded, expected))
	self.downloaded = downloaded
	self.expected = expected
308 class YoutubeDLHandler(urllib2.HTTPHandler):
309 """Handler for HTTP requests and responses.
311 This class, when installed with an OpenerDirector, automatically adds
312 the standard headers to every HTTP request and handles gzipped and
313 deflated responses from web servers. If compression is to be avoided in
314 a particular request, the original request in the program code only has
315 to include the HTTP header "Youtubedl-No-Compression", which will be
316 removed before making the real request.
318 Part of this code was copied from:
320 http://techknack.net/python-urllib2-handlers/
322 Andrew Rowls, the author of that code, agreed to release it to the
329 return zlib.decompress(data, -zlib.MAX_WBITS)
331 return zlib.decompress(data)
334 def addinfourl_wrapper(stream, headers, url, code):
335 if hasattr(urllib2.addinfourl, 'getcode'):
336 return urllib2.addinfourl(stream, headers, url, code)
337 ret = urllib2.addinfourl(stream, headers, url)
341 def http_request(self, req):
342 for h in std_headers:
345 req.add_header(h, std_headers[h])
346 if 'Youtubedl-no-compression' in req.headers:
347 if 'Accept-encoding' in req.headers:
348 del req.headers['Accept-encoding']
349 del req.headers['Youtubedl-no-compression']
352 def http_response(self, req, resp):
355 if resp.headers.get('Content-encoding', '') == 'gzip':
356 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
357 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
358 resp.msg = old_resp.msg
360 if resp.headers.get('Content-encoding', '') == 'deflate':
361 gz = StringIO.StringIO(self.deflate(resp.read()))
362 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
363 resp.msg = old_resp.msg
366 class FileDownloader(object):
367 """File Downloader class.
369 File downloader objects are the ones responsible of downloading the
370 actual video file and writing it to disk if the user has requested
371 it, among some other tasks. In most cases there should be one per
372 program. As, given a video URL, the downloader doesn't know how to
373 extract all the needed information, task that InfoExtractors do, it
374 has to pass the URL to one of them.
376 For this, file downloader objects have a method that allows
377 InfoExtractors to be registered in a given order. When it is passed
378 a URL, the file downloader handles it to the first InfoExtractor it
379 finds that reports being able to handle it. The InfoExtractor extracts
380 all the information about the video or videos the URL refers to, and
381 asks the FileDownloader to process the video information, possibly
382 downloading the video.
384 File downloaders accept a lot of parameters. In order not to saturate
385 the object constructor with arguments, it receives a dictionary of
386 options instead. These options are available through the params
387 attribute for the InfoExtractors to use. The FileDownloader also
388 registers itself as the downloader in charge for the InfoExtractors
389 that are added to it, so this is a "mutual registration".
393 username: Username for authentication purposes.
394 password: Password for authentication purposes.
395 usenetrc: Use netrc for authentication instead.
396 quiet: Do not print messages to stdout.
397 forceurl: Force printing final URL.
398 forcetitle: Force printing title.
399 forcethumbnail: Force printing thumbnail URL.
400 forcedescription: Force printing description.
401 forcefilename: Force printing final filename.
402 simulate: Do not download the video files.
403 format: Video format code.
404 format_limit: Highest quality format to try.
405 outtmpl: Template for output names.
406 ignoreerrors: Do not stop on download errors.
407 ratelimit: Download speed limit, in bytes/sec.
408 nooverwrites: Prevent overwriting files.
409 retries: Number of times to retry for HTTP error 5xx
410 continuedl: Try to continue downloads if possible.
411 noprogress: Do not print the progress bar.
412 playliststart: Playlist item to start at.
413 playlistend: Playlist item to end at.
414 logtostderr: Log messages to stderr instead of stdout.
415 consoletitle: Display progress in console window's titlebar.
416 nopart: Do not use temporary .part files.
417 updatetime: Use the Last-modified header to set output file timestamps.
418 writedescription: Write the video description to a .description file
419 writeinfojson: Write the video description to a .info.json file
425 _download_retcode = None
426 _num_downloads = None
429 def __init__(self, params):
430 """Create a FileDownloader object with the given options."""
433 self._download_retcode = 0
434 self._num_downloads = 0
435 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
439 def pmkdir(filename):
440 """Create directory components in filename. Similar to Unix "mkdir -p"."""
441 components = filename.split(os.sep)
442 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
443 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
444 for dir in aggregate:
445 if not os.path.exists(dir):
449 def format_bytes(bytes):
452 if type(bytes) is str:
457 exponent = long(math.log(bytes, 1024.0))
458 suffix = 'bkMGTPEZY'[exponent]
459 converted = float(bytes) / float(1024**exponent)
460 return '%.2f%s' % (converted, suffix)
463 def calc_percent(byte_counter, data_len):
466 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
469 def calc_eta(start, now, total, current):
473 if current == 0 or dif < 0.001: # One millisecond
475 rate = float(current) / dif
476 eta = long((float(total) - float(current)) / rate)
477 (eta_mins, eta_secs) = divmod(eta, 60)
480 return '%02d:%02d' % (eta_mins, eta_secs)
483 def calc_speed(start, now, bytes):
485 if bytes == 0 or dif < 0.001: # One millisecond
486 return '%10s' % '---b/s'
487 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
490 def best_block_size(elapsed_time, bytes):
491 new_min = max(bytes / 2.0, 1.0)
492 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
493 if elapsed_time < 0.001:
495 rate = bytes / elapsed_time
503 def parse_bytes(bytestr):
504 """Parse a string indicating a byte quantity into a long integer."""
505 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
508 number = float(matchobj.group(1))
509 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
510 return long(round(number * multiplier))
512 def add_info_extractor(self, ie):
513 """Add an InfoExtractor object to the end of the list."""
515 ie.set_downloader(self)
517 def add_post_processor(self, pp):
518 """Add a PostProcessor object to the end of the chain."""
520 pp.set_downloader(self)
522 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
523 """Print message to stdout if not in quiet mode."""
525 if not self.params.get('quiet', False):
526 terminator = [u'\n', u''][skip_eol]
527 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
528 self._screen_file.flush()
529 except (UnicodeEncodeError), err:
530 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write a message to standard error, encoded for the console."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
537 def to_cons_title(self, message):
538 """Set console/terminal window title to message."""
539 if not self.params.get('consoletitle', False):
541 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
542 # c_wchar_p() might not be necessary if `message` is
543 # already of type unicode()
544 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
545 elif 'TERM' in os.environ:
546 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
548 def fixed_template(self):
549 """Checks if the output template is fixed."""
550 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
552 def trouble(self, message=None):
553 """Determine action to take when a download problem appears.
555 Depending on if the downloader has been configured to ignore
556 download errors or not, this method may throw an exception or
557 not when errors are found, after printing the message.
559 if message is not None:
560 self.to_stderr(message)
561 if not self.params.get('ignoreerrors', False):
562 raise DownloadError(message)
563 self._download_retcode = 1
565 def slow_down(self, start_time, byte_counter):
566 """Sleep if the download speed is over the rate limit."""
567 rate_limit = self.params.get('ratelimit', None)
568 if rate_limit is None or byte_counter == 0:
571 elapsed = now - start_time
574 speed = float(byte_counter) / elapsed
575 if speed > rate_limit:
576 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def temp_name(self, filename):
	"""Returns a temporary filename for the given filename."""
	# Downloads go to a ".part" file first, unless disabled (--no-part),
	# writing to stdout ('-'), or the target exists but is not a regular
	# file (e.g. a named pipe).
	if self.params.get('nopart', False) or filename == u'-' or \
			(os.path.exists(filename) and not os.path.isfile(filename)):
		# NOTE(review): the branch body (presumably returning the filename
		# unchanged) is elided in this excerpt.
	return filename + u'.part'
585 def undo_temp_name(self, filename):
586 if filename.endswith(u'.part'):
587 return filename[:-len(u'.part')]
def try_rename(self, old_filename, new_filename):
	"""Rename the temporary file to its final name, reporting failures.

	NOTE(review): the enclosing try: line and the early return after the
	equality check are elided in this excerpt; the except below pairs
	with that elided try.
	"""
	if old_filename == new_filename:
	os.rename(old_filename, new_filename)
	except (IOError, OSError), err:
		# Best-effort: failure is reported via trouble() rather than raised.
		self.trouble(u'ERROR: unable to rename file')
598 def try_utime(self, filename, last_modified_hdr):
599 """Try to set the last-modified time of the given file."""
600 if last_modified_hdr is None:
602 if not os.path.isfile(filename):
604 timestr = last_modified_hdr
607 filetime = timeconvert(timestr)
611 os.utime(filename,(time.time(), filetime))
def report_writedescription(self, descfn):
	"""Report that the video description file is being written."""
	message = u'[info] Writing video description to: %s' % descfn
	self.to_screen(message, ignore_encoding_errors=True)

def report_writeinfojson(self, infofn):
	"""Report that the JSON metadata file has been written."""
	message = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(message, ignore_encoding_errors=True)

def report_destination(self, filename):
	"""Report the destination filename of the download."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
627 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
628 """Report download progress."""
629 if self.params.get('noprogress', False):
631 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
632 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
633 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
634 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Report an attempt to resume the download at the given byte offset."""
	self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

def report_retry(self, count, retries):
	"""Report a retry after an HTTP 5xx server error."""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
def report_file_already_downloaded(self, file_name):
	"""Report file has already been fully downloaded."""
	# NOTE(review): the try: guarding the encode of file_name is elided in
	# this excerpt; the except below pairs with that elided try.
	self.to_screen(u'[download] %s has already been downloaded' % file_name)
	except (UnicodeEncodeError), err:
		# Fall back to a generic message when the name cannot be encoded.
		self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Report that the download could not be resumed."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
def report_finish(self):
	"""Report download finished."""
	if self.params.get('noprogress', False):
		# Quiet progress mode: print a single completion line.
		self.to_screen(u'[download] Download completed')
	# NOTE(review): the else branch (terminating the progress-bar line)
	# is elided in this excerpt.
def increment_downloads(self):
	"""Advance the ordinal that assigns a number to each downloaded file."""
	self._num_downloads = self._num_downloads + 1
666 def prepare_filename(self, info_dict):
667 """Generate the output filename."""
669 template_dict = dict(info_dict)
670 template_dict['epoch'] = unicode(long(time.time()))
671 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
672 filename = self.params['outtmpl'] % template_dict
674 except (ValueError, KeyError), err:
675 self.trouble(u'ERROR: invalid system charset or erroneous output template')
678 def process_info(self, info_dict):
679 """Process a single dictionary returned by an InfoExtractor."""
680 filename = self.prepare_filename(info_dict)
681 # Do nothing else if in simulate mode
682 if self.params.get('simulate', False):
684 if self.params.get('forcetitle', False):
685 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
686 if self.params.get('forceurl', False):
687 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
688 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
689 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
690 if self.params.get('forcedescription', False) and 'description' in info_dict:
691 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
692 if self.params.get('forcefilename', False) and filename is not None:
693 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
699 if self.params.get('nooverwrites', False) and os.path.exists(filename):
700 self.to_stderr(u'WARNING: file exists and will be skipped')
704 self.pmkdir(filename)
705 except (OSError, IOError), err:
706 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
709 if self.params.get('writedescription', False):
711 descfn = filename + '.description'
712 self.report_writedescription(descfn)
713 descfile = open(descfn, 'wb')
715 descfile.write(info_dict['description'].encode('utf-8'))
718 except (OSError, IOError):
719 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
722 if self.params.get('writeinfojson', False):
723 infofn = filename + '.info.json'
724 self.report_writeinfojson(infofn)
727 except (NameError,AttributeError):
728 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
731 infof = open(infofn, 'wb')
733 json.dump(info_dict, infof)
736 except (OSError, IOError):
737 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
741 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
742 except (OSError, IOError), err:
743 raise UnavailableVideoError
744 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
745 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
747 except (ContentTooShortError, ), err:
748 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
753 self.post_process(filename, info_dict)
754 except (PostProcessingError), err:
755 self.trouble(u'ERROR: postprocessing: %s' % str(err))
758 def download(self, url_list):
759 """Download a given list of URLs."""
760 if len(url_list) > 1 and self.fixed_template():
761 raise SameFileError(self.params['outtmpl'])
764 suitable_found = False
766 # Go to next InfoExtractor if not suitable
767 if not ie.suitable(url):
770 # Suitable InfoExtractor found
771 suitable_found = True
773 # Extract information from URL and process it
776 # Suitable InfoExtractor had been found; go to next URL
779 if not suitable_found:
780 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
782 return self._download_retcode
784 def post_process(self, filename, ie_info):
785 """Run the postprocessing chain on the given file."""
787 info['filepath'] = filename
793 def _download_with_rtmpdump(self, filename, url, player_url):
794 self.report_destination(filename)
795 tmpfilename = self.temp_name(filename)
797 # Check for rtmpdump first
799 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
800 except (OSError, IOError):
801 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
804 # Download using rtmpdump. rtmpdump returns exit code 2 when
805 # the connection was interrumpted and resuming appears to be
806 # possible. This is part of rtmpdump's normal usage, AFAIK.
807 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
808 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
809 while retval == 2 or retval == 1:
810 prevsize = os.path.getsize(tmpfilename)
811 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
812 time.sleep(5.0) # This seems to be needed
813 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
814 cursize = os.path.getsize(tmpfilename)
815 if prevsize == cursize and retval == 1:
818 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
819 self.try_rename(tmpfilename, filename)
822 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
825 def _do_download(self, filename, url, player_url):
826 # Check file already present
827 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
828 self.report_file_already_downloaded(filename)
831 # Attempt to download using rtmpdump
832 if url.startswith('rtmp'):
833 return self._download_with_rtmpdump(filename, url, player_url)
835 tmpfilename = self.temp_name(filename)
839 # Do not include the Accept-Encoding header
840 headers = {'Youtubedl-no-compression': 'True'}
841 basic_request = urllib2.Request(url, None, headers)
842 request = urllib2.Request(url, None, headers)
844 # Establish possible resume length
845 if os.path.isfile(tmpfilename):
846 resume_len = os.path.getsize(tmpfilename)
850 # Request parameters in case of being able to resume
851 if self.params.get('continuedl', False) and resume_len != 0:
852 self.report_resuming_byte(resume_len)
853 request.add_header('Range','bytes=%d-' % resume_len)
857 retries = self.params.get('retries', 0)
858 while count <= retries:
859 # Establish connection
861 data = urllib2.urlopen(request)
863 except (urllib2.HTTPError, ), err:
864 if (err.code < 500 or err.code >= 600) and err.code != 416:
865 # Unexpected HTTP error
867 elif err.code == 416:
868 # Unable to resume (requested range not satisfiable)
870 # Open the connection again without the range header
871 data = urllib2.urlopen(basic_request)
872 content_length = data.info()['Content-Length']
873 except (urllib2.HTTPError, ), err:
874 if err.code < 500 or err.code >= 600:
877 # Examine the reported length
878 if (content_length is not None and
879 (resume_len - 100 < long(content_length) < resume_len + 100)):
880 # The file had already been fully downloaded.
881 # Explanation to the above condition: in issue #175 it was revealed that
882 # YouTube sometimes adds or removes a few bytes from the end of the file,
883 # changing the file size slightly and causing problems for some users. So
884 # I decided to implement a suggested change and consider the file
885 # completely downloaded if the file size differs less than 100 bytes from
886 # the one in the hard drive.
887 self.report_file_already_downloaded(filename)
888 self.try_rename(tmpfilename, filename)
891 # The length does not match, we start the download over
892 self.report_unable_to_resume()
898 self.report_retry(count, retries)
901 self.trouble(u'ERROR: giving up after %s retries' % retries)
904 data_len = data.info().get('Content-length', None)
905 if data_len is not None:
906 data_len = long(data_len) + resume_len
907 data_len_str = self.format_bytes(data_len)
908 byte_counter = 0 + resume_len
914 data_block = data.read(block_size)
916 if len(data_block) == 0:
918 byte_counter += len(data_block)
920 # Open file just in time
923 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
924 filename = self.undo_temp_name(tmpfilename)
925 self.report_destination(filename)
926 except (OSError, IOError), err:
927 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
930 stream.write(data_block)
931 except (IOError, OSError), err:
932 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
934 block_size = self.best_block_size(after - before, len(data_block))
937 percent_str = self.calc_percent(byte_counter, data_len)
938 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
939 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
940 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
943 self.slow_down(start, byte_counter - resume_len)
947 if data_len is not None and byte_counter != data_len:
948 raise ContentTooShortError(byte_counter, long(data_len))
949 self.try_rename(tmpfilename, filename)
951 # Update file modification time
952 if self.params.get('updatetime', True):
953 self.try_utime(filename, data.info().get('last-modified', None))
957 class InfoExtractor(object):
958 """Information Extractor class.
960 Information extractors are the classes that, given a URL, extract
961 information from the video (or videos) the URL refers to. This
962 information includes the real video URL, the video title and simplified
963 title, author and others. The information is stored in a dictionary
964 which is then passed to the FileDownloader. The FileDownloader
965 processes this information possibly downloading the video to the file
966 system, among other possible outcomes. The dictionaries must include
967 the following fields:
969 id: Video identifier.
970 url: Final video URL.
971 uploader: Nickname of the video uploader.
972 title: Literal title.
973 stitle: Simplified title.
974 ext: Video filename extension.
975 format: Video format.
976 player_url: SWF Player URL (may be None).
978 The following fields are optional. Their primary purpose is to allow
979 youtube-dl to serve as the backend for a video search function, such
980 as the one in youtube2mp3. They are only used when their respective
981 forced printing functions are called:
983 thumbnail: Full URL to a video thumbnail image.
984 description: One-line video description.
986 Subclasses of this one should re-define the _real_initialize() and
987 _real_extract() methods, as well as the suitable() static method.
988 Probably, they should also be instantiated and added to the main
995 def __init__(self, downloader=None):
996 """Constructor. Receives an optional downloader."""
998 self.set_downloader(downloader)
1002 """Receives a URL and returns True if suitable for this IE."""
1005 def initialize(self):
1006 """Initializes an instance (authentication, etc)."""
1008 self._real_initialize()
1011 def extract(self, url):
1012 """Extracts URL information and returns it in list of dicts."""
1014 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Attach the FileDownloader this extractor reports to (may be None)."""
	self._downloader = downloader
1020 def _real_initialize(self):
1021 """Real initialization process. Redefine in subclasses."""
1024 def _real_extract(self, url):
1025 """Real extraction process. Redefine in subclasses."""
1028 class YoutubeIE(InfoExtractor):
1029 """Information extractor for youtube.com."""
1031 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1032 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1033 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1034 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1035 _NETRC_MACHINE = 'youtube'
1036 # Listed in order of quality
1037 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1038 _video_extensions = {
1044 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1051 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Report an attempt to set the interface language."""
	self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
	"""Report an attempt to log in."""
	self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
	"""Report an attempt to confirm the age gate."""
	self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
	"""Report an attempt to download the video webpage."""
	self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
	"""Report an attempt to download the video info webpage."""
	self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_information_extraction(self, video_id):
	"""Report an attempt to extract the video information."""
	self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
	"""Report that the requested format is not available for this video."""
	self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
	"""Indicate that the download will use the RTMP protocol."""
	self._downloader.to_screen(u'[youtube] RTMP download detected')
1085 def _real_initialize(self):
1086 if self._downloader is None:
1091 downloader_params = self._downloader.params
1093 # Attempt to use provided username and password or .netrc data
1094 if downloader_params.get('username', None) is not None:
1095 username = downloader_params['username']
1096 password = downloader_params['password']
1097 elif downloader_params.get('usenetrc', False):
1099 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1100 if info is not None:
1104 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1105 except (IOError, netrc.NetrcParseError), err:
1106 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1110 request = urllib2.Request(self._LANG_URL)
1113 urllib2.urlopen(request).read()
1114 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1115 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1118 # No authentication to be performed
1119 if username is None:
1124 'current_form': 'loginForm',
1126 'action_login': 'Log In',
1127 'username': username,
1128 'password': password,
1130 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1133 login_results = urllib2.urlopen(request).read()
1134 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1135 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1144 'action_confirm': 'Confirm',
1146 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1148 self.report_age_confirmation()
1149 age_results = urllib2.urlopen(request).read()
1150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1151 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1154 def _real_extract(self, url):
1155 # Extract video id from URL
1156 mobj = re.match(self._VALID_URL, url)
1158 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1160 video_id = mobj.group(2)
1163 self.report_video_webpage_download(video_id)
1164 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1166 video_webpage = urllib2.urlopen(request).read()
1167 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1171 # Attempt to extract SWF player URL
1172 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1173 if mobj is not None:
1174 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1179 self.report_video_info_webpage_download(video_id)
1180 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1181 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1182 % (video_id, el_type))
1183 request = urllib2.Request(video_info_url)
1185 video_info_webpage = urllib2.urlopen(request).read()
1186 video_info = parse_qs(video_info_webpage)
1187 if 'token' in video_info:
1189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1192 if 'token' not in video_info:
1193 if 'reason' in video_info:
1194 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1196 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1199 # Start extracting information
1200 self.report_information_extraction(video_id)
1203 if 'author' not in video_info:
1204 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1206 video_uploader = urllib.unquote_plus(video_info['author'][0])
1209 if 'title' not in video_info:
1210 self._downloader.trouble(u'ERROR: unable to extract video title')
1212 video_title = urllib.unquote_plus(video_info['title'][0])
1213 video_title = video_title.decode('utf-8')
1214 video_title = sanitize_title(video_title)
1217 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1218 simple_title = simple_title.strip(ur'_')
1221 if 'thumbnail_url' not in video_info:
1222 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1223 video_thumbnail = ''
1224 else: # don't panic if we can't find it
1225 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1229 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1230 if mobj is not None:
1231 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1232 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1233 for expression in format_expressions:
1235 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1243 video_description = u'No description available.'
1244 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1245 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1246 if mobj is not None:
1247 video_description = mobj.group(1).decode('utf-8')
1249 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1250 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1251 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1252 # TODO use another parser
1255 video_token = urllib.unquote_plus(video_info['token'][0])
1257 # Decide which formats to download
1258 req_format = self._downloader.params.get('format', None)
1260 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1261 self.report_rtmp_download()
1262 video_url_list = [(None, video_info['conn'][0])]
1263 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1264 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1265 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1266 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1267 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1269 format_limit = self._downloader.params.get('format_limit', None)
1270 if format_limit is not None and format_limit in self._available_formats:
1271 format_list = self._available_formats[self._available_formats.index(format_limit):]
1273 format_list = self._available_formats
1274 existing_formats = [x for x in format_list if x in url_map]
1275 if len(existing_formats) == 0:
1276 self._downloader.trouble(u'ERROR: no known formats available for video')
1278 if req_format is None:
1279 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1280 elif req_format == '-1':
1281 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1284 if req_format not in url_map:
1285 self._downloader.trouble(u'ERROR: requested format not available')
1287 video_url_list = [(req_format, url_map[req_format])] # Specific format
1289 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1292 for format_param, video_real_url in video_url_list:
1293 # At this point we have a new video
1294 self._downloader.increment_downloads()
1297 video_extension = self._video_extensions.get(format_param, 'flv')
1299 # Find the video URL in fmt_url_map or conn paramters
1301 # Process video information
1302 self._downloader.process_info({
1303 'id': video_id.decode('utf-8'),
1304 'url': video_real_url.decode('utf-8'),
1305 'uploader': video_uploader.decode('utf-8'),
1306 'upload_date': upload_date,
1307 'title': video_title,
1308 'stitle': simple_title,
1309 'ext': video_extension.decode('utf-8'),
1310 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1311 'thumbnail': video_thumbnail.decode('utf-8'),
1312 'description': video_description,
1313 'player_url': player_url,
1315 except UnavailableVideoError, err:
1316 self._downloader.trouble(u'\nERROR: unable to download video')
1319 class MetacafeIE(InfoExtractor):
1320 """Information Extractor for metacafe.com."""
1322 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1323 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1324 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Create the Metacafe extractor.

    youtube_ie: a YoutubeIE instance, kept so that Metacafe "yt-<id>"
        videos (which are hosted on YouTube) can be delegated to it.
    downloader: the object that drives downloads and receives status
        messages (presumably a FileDownloader -- its type is not
        visible in this chunk; confirm against InfoExtractor).
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
1333 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce retrieval of the Metacafe disclaimer page."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
    """Announce the download of the page for *video_id*."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1351 def _real_initialize(self):
1352 # Retrieve disclaimer
1353 request = urllib2.Request(self._DISCLAIMER)
1355 self.report_disclaimer()
1356 disclaimer = urllib2.urlopen(request).read()
1357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1358 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1364 'submit': "Continue - I'm over 18",
1366 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1368 self.report_age_confirmation()
1369 disclaimer = urllib2.urlopen(request).read()
1370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1371 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1374 def _real_extract(self, url):
1375 # Extract id and simplified title from URL
1376 mobj = re.match(self._VALID_URL, url)
1378 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1381 video_id = mobj.group(1)
1383 # Check if video comes from YouTube
1384 mobj2 = re.match(r'^yt-(.*)$', video_id)
1385 if mobj2 is not None:
1386 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1389 # At this point we have a new video
1390 self._downloader.increment_downloads()
1392 simple_title = mobj.group(2).decode('utf-8')
1394 # Retrieve video webpage to extract further information
1395 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1397 self.report_download_webpage(video_id)
1398 webpage = urllib2.urlopen(request).read()
1399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1403 # Extract URL, uploader and title from webpage
1404 self.report_extraction(video_id)
1405 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1406 if mobj is not None:
1407 mediaURL = urllib.unquote(mobj.group(1))
1408 video_extension = mediaURL[-3:]
1410 # Extract gdaKey if available
1411 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1413 video_url = mediaURL
1415 gdaKey = mobj.group(1)
1416 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1418 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1420 self._downloader.trouble(u'ERROR: unable to extract media URL')
1422 vardict = parse_qs(mobj.group(1))
1423 if 'mediaData' not in vardict:
1424 self._downloader.trouble(u'ERROR: unable to extract media URL')
1426 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1428 self._downloader.trouble(u'ERROR: unable to extract media URL')
1430 mediaURL = mobj.group(1).replace('\\/', '/')
1431 video_extension = mediaURL[-3:]
1432 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1434 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1436 self._downloader.trouble(u'ERROR: unable to extract title')
1438 video_title = mobj.group(1).decode('utf-8')
1439 video_title = sanitize_title(video_title)
1441 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1443 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1445 video_uploader = mobj.group(1)
1448 # Process video information
1449 self._downloader.process_info({
1450 'id': video_id.decode('utf-8'),
1451 'url': video_url.decode('utf-8'),
1452 'uploader': video_uploader.decode('utf-8'),
1453 'upload_date': u'NA',
1454 'title': video_title,
1455 'stitle': simple_title,
1456 'ext': video_extension.decode('utf-8'),
1460 except UnavailableVideoError:
1461 self._downloader.trouble(u'\nERROR: unable to download video')
1464 class DailymotionIE(InfoExtractor):
1465 """Information Extractor for Dailymotion"""
1467 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
def __init__(self, downloader=None):
    """Create the Dailymotion extractor.

    Delegates all setup to InfoExtractor; this extractor keeps no
    state of its own. *downloader* receives status messages
    (presumably a FileDownloader -- not visible in this chunk).
    """
    InfoExtractor.__init__(self, downloader)
1474 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the page for *video_id*."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1484 def _real_initialize(self):
1487 def _real_extract(self, url):
1488 # Extract id and simplified title from URL
1489 mobj = re.match(self._VALID_URL, url)
1491 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1494 # At this point we have a new video
1495 self._downloader.increment_downloads()
1496 video_id = mobj.group(1)
1498 simple_title = mobj.group(2).decode('utf-8')
1499 video_extension = 'flv'
1501 # Retrieve video webpage to extract further information
1502 request = urllib2.Request(url)
1504 self.report_download_webpage(video_id)
1505 webpage = urllib2.urlopen(request).read()
1506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1507 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1510 # Extract URL, uploader and title from webpage
1511 self.report_extraction(video_id)
1512 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1514 self._downloader.trouble(u'ERROR: unable to extract media URL')
1516 mediaURL = urllib.unquote(mobj.group(1))
1518 # if needed add http://www.dailymotion.com/ if relative URL
1520 video_url = mediaURL
1522 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1523 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1525 self._downloader.trouble(u'ERROR: unable to extract title')
1527 video_title = mobj.group(1).decode('utf-8')
1528 video_title = sanitize_title(video_title)
1530 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1532 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1534 video_uploader = mobj.group(1)
1537 # Process video information
1538 self._downloader.process_info({
1539 'id': video_id.decode('utf-8'),
1540 'url': video_url.decode('utf-8'),
1541 'uploader': video_uploader.decode('utf-8'),
1542 'upload_date': u'NA',
1543 'title': video_title,
1544 'stitle': simple_title,
1545 'ext': video_extension.decode('utf-8'),
1549 except UnavailableVideoError:
1550 self._downloader.trouble(u'\nERROR: unable to download video')
1552 class GoogleIE(InfoExtractor):
1553 """Information extractor for video.google.com."""
1555 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    """Create the Google Video extractor.

    Delegates all setup to InfoExtractor; this extractor keeps no
    state of its own.
    """
    InfoExtractor.__init__(self, downloader)
1562 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the page for *video_id*."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1572 def _real_initialize(self):
1575 def _real_extract(self, url):
1576 # Extract id from URL
1577 mobj = re.match(self._VALID_URL, url)
1579 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1582 # At this point we have a new video
1583 self._downloader.increment_downloads()
1584 video_id = mobj.group(1)
1586 video_extension = 'mp4'
1588 # Retrieve video webpage to extract further information
1589 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1591 self.report_download_webpage(video_id)
1592 webpage = urllib2.urlopen(request).read()
1593 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1597 # Extract URL, uploader, and title from webpage
1598 self.report_extraction(video_id)
1599 mobj = re.search(r"download_url:'([^']+)'", webpage)
1601 video_extension = 'flv'
1602 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1604 self._downloader.trouble(u'ERROR: unable to extract media URL')
1606 mediaURL = urllib.unquote(mobj.group(1))
1607 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1608 mediaURL = mediaURL.replace('\\x26', '\x26')
1610 video_url = mediaURL
1612 mobj = re.search(r'<title>(.*)</title>', webpage)
1614 self._downloader.trouble(u'ERROR: unable to extract title')
1616 video_title = mobj.group(1).decode('utf-8')
1617 video_title = sanitize_title(video_title)
1618 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1620 # Extract video description
1621 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1623 self._downloader.trouble(u'ERROR: unable to extract video description')
1625 video_description = mobj.group(1).decode('utf-8')
1626 if not video_description:
1627 video_description = 'No description available.'
1629 # Extract video thumbnail
1630 if self._downloader.params.get('forcethumbnail', False):
1631 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1633 webpage = urllib2.urlopen(request).read()
1634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1637 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1639 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1641 video_thumbnail = mobj.group(1)
1642 else: # we need something to pass to process_info
1643 video_thumbnail = ''
1647 # Process video information
1648 self._downloader.process_info({
1649 'id': video_id.decode('utf-8'),
1650 'url': video_url.decode('utf-8'),
1652 'upload_date': u'NA',
1653 'title': video_title,
1654 'stitle': simple_title,
1655 'ext': video_extension.decode('utf-8'),
1659 except UnavailableVideoError:
1660 self._downloader.trouble(u'\nERROR: unable to download video')
1663 class PhotobucketIE(InfoExtractor):
1664 """Information extractor for photobucket.com."""
1666 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
    """Create the Photobucket extractor.

    Delegates all setup to InfoExtractor; this extractor keeps no
    state of its own.
    """
    InfoExtractor.__init__(self, downloader)
1673 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the page for *video_id*."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1683 def _real_initialize(self):
1686 def _real_extract(self, url):
1687 # Extract id from URL
1688 mobj = re.match(self._VALID_URL, url)
1690 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1693 # At this point we have a new video
1694 self._downloader.increment_downloads()
1695 video_id = mobj.group(1)
1697 video_extension = 'flv'
1699 # Retrieve video webpage to extract further information
1700 request = urllib2.Request(url)
1702 self.report_download_webpage(video_id)
1703 webpage = urllib2.urlopen(request).read()
1704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1705 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1708 # Extract URL, uploader, and title from webpage
1709 self.report_extraction(video_id)
1710 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1712 self._downloader.trouble(u'ERROR: unable to extract media URL')
1714 mediaURL = urllib.unquote(mobj.group(1))
1716 video_url = mediaURL
1718 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1720 self._downloader.trouble(u'ERROR: unable to extract title')
1722 video_title = mobj.group(1).decode('utf-8')
1723 video_title = sanitize_title(video_title)
1724 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1726 video_uploader = mobj.group(2).decode('utf-8')
1729 # Process video information
1730 self._downloader.process_info({
1731 'id': video_id.decode('utf-8'),
1732 'url': video_url.decode('utf-8'),
1733 'uploader': video_uploader,
1734 'upload_date': u'NA',
1735 'title': video_title,
1736 'stitle': simple_title,
1737 'ext': video_extension.decode('utf-8'),
1741 except UnavailableVideoError:
1742 self._downloader.trouble(u'\nERROR: unable to download video')
1745 class YahooIE(InfoExtractor):
1746 """Information extractor for video.yahoo.com."""
1748 # _VALID_URL matches all Yahoo! Video URLs
1749 # _VPAGE_URL matches only the extractable '/watch/' URLs
1750 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1751 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
    """Create the Yahoo! Video extractor.

    Delegates all setup to InfoExtractor; this extractor keeps no
    state of its own.
    """
    InfoExtractor.__init__(self, downloader)
1758 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the download of the page for *video_id*."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1768 def _real_initialize(self):
1771 def _real_extract(self, url, new_video=True):
1772 # Extract ID from URL
1773 mobj = re.match(self._VALID_URL, url)
1775 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1778 # At this point we have a new video
1779 self._downloader.increment_downloads()
1780 video_id = mobj.group(2)
1781 video_extension = 'flv'
1783 # Rewrite valid but non-extractable URLs as
1784 # extractable English language /watch/ URLs
1785 if re.match(self._VPAGE_URL, url) is None:
1786 request = urllib2.Request(url)
1788 webpage = urllib2.urlopen(request).read()
1789 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1790 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1793 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1795 self._downloader.trouble(u'ERROR: Unable to extract id field')
1797 yahoo_id = mobj.group(1)
1799 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1801 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1803 yahoo_vid = mobj.group(1)
1805 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1806 return self._real_extract(url, new_video=False)
1808 # Retrieve video webpage to extract further information
1809 request = urllib2.Request(url)
1811 self.report_download_webpage(video_id)
1812 webpage = urllib2.urlopen(request).read()
1813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1814 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1817 # Extract uploader and title from webpage
1818 self.report_extraction(video_id)
1819 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1821 self._downloader.trouble(u'ERROR: unable to extract video title')
1823 video_title = mobj.group(1).decode('utf-8')
1824 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1826 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1828 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1830 video_uploader = mobj.group(1).decode('utf-8')
1832 # Extract video thumbnail
1833 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1835 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1837 video_thumbnail = mobj.group(1).decode('utf-8')
1839 # Extract video description
1840 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1842 self._downloader.trouble(u'ERROR: unable to extract video description')
1844 video_description = mobj.group(1).decode('utf-8')
1845 if not video_description: video_description = 'No description available.'
1847 # Extract video height and width
1848 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1850 self._downloader.trouble(u'ERROR: unable to extract video height')
1852 yv_video_height = mobj.group(1)
1854 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1856 self._downloader.trouble(u'ERROR: unable to extract video width')
1858 yv_video_width = mobj.group(1)
1860 # Retrieve video playlist to extract media URL
1861 # I'm not completely sure what all these options are, but we
1862 # seem to need most of them, otherwise the server sends a 401.
1863 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1864 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1865 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1866 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1867 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1869 self.report_download_webpage(video_id)
1870 webpage = urllib2.urlopen(request).read()
1871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1872 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1875 # Extract media URL from playlist XML
1876 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1878 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1880 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1881 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1884 # Process video information
1885 self._downloader.process_info({
1886 'id': video_id.decode('utf-8'),
1888 'uploader': video_uploader,
1889 'upload_date': u'NA',
1890 'title': video_title,
1891 'stitle': simple_title,
1892 'ext': video_extension.decode('utf-8'),
1893 'thumbnail': video_thumbnail.decode('utf-8'),
1894 'description': video_description,
1895 'thumbnail': video_thumbnail,
1896 'description': video_description,
1899 except UnavailableVideoError:
1900 self._downloader.trouble(u'\nERROR: unable to download video')
1903 class GenericIE(InfoExtractor):
1904 """Generic last-resort information extractor."""
def __init__(self, downloader=None):
    """Create the generic last-resort extractor.

    Delegates all setup to InfoExtractor; this extractor keeps no
    state of its own.
    """
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic extractor is in use, then announce the page download."""
    warning = u'WARNING: Falling back on generic information extractor.'
    self._downloader.to_screen(warning)
    self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1922 def _real_initialize(self):
1925 def _real_extract(self, url):
1926 # At this point we have a new video
1927 self._downloader.increment_downloads()
1929 video_id = url.split('/')[-1]
1930 request = urllib2.Request(url)
1932 self.report_download_webpage(video_id)
1933 webpage = urllib2.urlopen(request).read()
1934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1937 except ValueError, err:
1938 # since this is the last-resort InfoExtractor, if
1939 # this error is thrown, it'll be thrown here
1940 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1943 self.report_extraction(video_id)
1944 # Start with something easy: JW Player in SWFObject
1945 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1947 # Broaden the search a little bit
1948 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1950 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1953 # It's possible that one of the regexes
1954 # matched, but returned an empty group:
1955 if mobj.group(1) is None:
1956 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1959 video_url = urllib.unquote(mobj.group(1))
1960 video_id = os.path.basename(video_url)
1962 # here's a fun little line of code for you:
1963 video_extension = os.path.splitext(video_id)[1][1:]
1964 video_id = os.path.splitext(video_id)[0]
1966 # it's tempting to parse this further, but you would
1967 # have to take into account all the variations like
1968 # Video Title - Site Name
1969 # Site Name | Video Title
1970 # Video Title - Tagline | Site Name
1971 # and so on and so forth; it's just not practical
1972 mobj = re.search(r'<title>(.*)</title>', webpage)
1974 self._downloader.trouble(u'ERROR: unable to extract title')
1976 video_title = mobj.group(1).decode('utf-8')
1977 video_title = sanitize_title(video_title)
1978 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1980 # video uploader is domain name
1981 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1983 self._downloader.trouble(u'ERROR: unable to extract title')
1985 video_uploader = mobj.group(1).decode('utf-8')
1988 # Process video information
1989 self._downloader.process_info({
1990 'id': video_id.decode('utf-8'),
1991 'url': video_url.decode('utf-8'),
1992 'uploader': video_uploader,
1993 'upload_date': u'NA',
1994 'title': video_title,
1995 'stitle': simple_title,
1996 'ext': video_extension.decode('utf-8'),
2000 except UnavailableVideoError, err:
2001 self._downloader.trouble(u'\nERROR: unable to download video')
2004 class YoutubeSearchIE(InfoExtractor):
2005 """Information Extractor for YouTube search queries."""
2006 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2007 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2008 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2009 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2011 _max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
    """Create the YouTube search extractor.

    youtube_ie: a YoutubeIE instance used to extract each video found
        by a search query.
    downloader: the object that drives downloads and receives status
        messages (presumably a FileDownloader -- not visible here).
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
2019 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce that result page *pagenum* for *query* is being fetched."""
    # Decode the byte-string query with the user's preferred encoding
    # before interpolating it into the unicode status message.
    decoded_query = query.decode(preferredencoding())
    message = u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(message)
2026 def _real_initialize(self):
2027 self._youtube_ie.initialize()
2029 def _real_extract(self, query):
2030 mobj = re.match(self._VALID_QUERY, query)
2032 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2035 prefix, query = query.split(':')
2037 query = query.encode('utf-8')
2039 self._download_n_results(query, 1)
2041 elif prefix == 'all':
2042 self._download_n_results(query, self._max_youtube_results)
2048 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2050 elif n > self._max_youtube_results:
2051 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2052 n = self._max_youtube_results
2053 self._download_n_results(query, n)
2055 except ValueError: # parsing prefix as integer fails
2056 self._download_n_results(query, 1)
2059 def _download_n_results(self, query, n):
2060 """Downloads a specified number of results for a query"""
2063 already_seen = set()
2067 self.report_download_page(query, pagenum)
2068 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2069 request = urllib2.Request(result_url)
2071 page = urllib2.urlopen(request).read()
2072 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2073 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2076 # Extract video identifiers
2077 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2078 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2079 if video_id not in already_seen:
2080 video_ids.append(video_id)
2081 already_seen.add(video_id)
2082 if len(video_ids) == n:
2083 # Specified n videos reached
2084 for id in video_ids:
2085 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2088 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2089 for id in video_ids:
2090 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2093 pagenum = pagenum + 1
# Handles "gvsearchN:query" / "gvsearchall:query" pseudo-URLs for Google
# Video: same page-scraping pattern as YoutubeSearchIE, delegating each
# docid to the wrapped GoogleIE.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2095 class GoogleSearchIE(InfoExtractor):
2096 """Information Extractor for Google Video search queries."""
2097 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2098 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Capture group 1 is the docid used to rebuild the videoplay URL.
2099 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2100 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2102 _max_google_results = 1000
2104 def __init__(self, google_ie, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
2106 self._google_ie = google_ie
2110 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2112 def report_download_page(self, query, pagenum):
2113 """Report attempt to download playlist page with given number."""
2114 query = query.decode(preferredencoding())
2115 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2117 def _real_initialize(self):
2118 self._google_ie.initialize()
2120 def _real_extract(self, query):
2121 mobj = re.match(self._VALID_QUERY, query)
2123 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2126 prefix, query = query.split(':')
2128 query = query.encode('utf-8')
2130 self._download_n_results(query, 1)
2132 elif prefix == 'all':
2133 self._download_n_results(query, self._max_google_results)
2139 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2141 elif n > self._max_google_results:
2142 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2143 n = self._max_google_results
2144 self._download_n_results(query, n)
2146 except ValueError: # parsing prefix as integer fails
2147 self._download_n_results(query, 1)
2150 def _download_n_results(self, query, n):
2151 """Downloads a specified number of results for a query"""
2154 already_seen = set()
2158 self.report_download_page(query, pagenum)
# NOTE(review): the template's "start=%s" is filled with the raw pagenum,
# not a result offset — verify the intended paging semantics upstream.
2159 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2160 request = urllib2.Request(result_url)
2162 page = urllib2.urlopen(request).read()
2163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2164 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2167 # Extract video identifiers
2168 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2169 video_id = mobj.group(1)
2170 if video_id not in already_seen:
2171 video_ids.append(video_id)
2172 already_seen.add(video_id)
2173 if len(video_ids) == n:
2174 # Specified n videos reached
2175 for id in video_ids:
2176 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# Last page (no Next link): download everything collected so far.
2179 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2180 for id in video_ids:
2181 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2184 pagenum = pagenum + 1
# Handles "yvsearchN:query" / "yvsearchall:query" pseudo-URLs for Yahoo!
# Video: same page-scraping pattern as the other search IEs, delegating
# each watch id to the wrapped YahooIE.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2186 class YahooSearchIE(InfoExtractor):
2187 """Information Extractor for Yahoo! Video search queries."""
2188 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2189 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Capture group 1 is the "<uploader>/<video>" watch path segment.
2190 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2191 _MORE_PAGES_INDICATOR = r'\s*Next'
2193 _max_yahoo_results = 1000
2195 def __init__(self, yahoo_ie, downloader=None):
2196 InfoExtractor.__init__(self, downloader)
2197 self._yahoo_ie = yahoo_ie
2201 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2203 def report_download_page(self, query, pagenum):
2204 """Report attempt to download playlist page with given number."""
2205 query = query.decode(preferredencoding())
2206 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2208 def _real_initialize(self):
2209 self._yahoo_ie.initialize()
2211 def _real_extract(self, query):
2212 mobj = re.match(self._VALID_QUERY, query)
2214 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2217 prefix, query = query.split(':')
2219 query = query.encode('utf-8')
2221 self._download_n_results(query, 1)
2223 elif prefix == 'all':
2224 self._download_n_results(query, self._max_yahoo_results)
2230 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2232 elif n > self._max_yahoo_results:
2233 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2234 n = self._max_yahoo_results
2235 self._download_n_results(query, n)
2237 except ValueError: # parsing prefix as integer fails
2238 self._download_n_results(query, 1)
2241 def _download_n_results(self, query, n):
2242 """Downloads a specified number of results for a query"""
2245 already_seen = set()
2249 self.report_download_page(query, pagenum)
2250 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2251 request = urllib2.Request(result_url)
2253 page = urllib2.urlopen(request).read()
2254 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2258 # Extract video identifiers
2259 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2260 video_id = mobj.group(1)
2261 if video_id not in already_seen:
2262 video_ids.append(video_id)
2263 already_seen.add(video_id)
2264 if len(video_ids) == n:
2265 # Specified n videos reached
2266 for id in video_ids:
2267 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# Last page (no Next link): download everything collected so far.
2270 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2271 for id in video_ids:
2272 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2275 pagenum = pagenum + 1
# Walks a YouTube playlist / artist page / user channel page-by-page,
# collects video ids, applies --playlist-start/--playlist-end slicing and
# delegates each id to the wrapped YoutubeIE.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2277 class YoutubePlaylistIE(InfoExtractor):
2278 """Information Extractor for YouTube playlists."""
# group(1): 'p' (playlist) or 'a' (artist); group(2): the list id;
# group(3): an optional direct video id appended to the URL.
2280 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2281 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2282 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2283 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2286 def __init__(self, youtube_ie, downloader=None):
2287 InfoExtractor.__init__(self, downloader)
2288 self._youtube_ie = youtube_ie
2292 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2294 def report_download_page(self, playlist_id, pagenum):
2295 """Report attempt to download playlist page with given number."""
2296 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2298 def _real_initialize(self):
2299 self._youtube_ie.initialize()
2301 def _real_extract(self, url):
2302 # Extract playlist id
2303 mobj = re.match(self._VALID_URL, url)
2305 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single video id embedded in the URL short-circuits the playlist walk.
2309 if mobj.group(3) is not None:
2310 self._youtube_ie.extract(mobj.group(3))
2313 # Download playlist pages
2314 # prefix is 'p' as default for playlists but there are other types that need extra care
2315 playlist_prefix = mobj.group(1)
2316 if playlist_prefix == 'a':
2317 playlist_access = 'artist'
2319 playlist_prefix = 'p'
2320 playlist_access = 'view_play_list'
2321 playlist_id = mobj.group(2)
2326 self.report_download_page(playlist_id, pagenum)
2327 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2329 page = urllib2.urlopen(request).read()
2330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2331 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2334 # Extract video identifiers
2336 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2337 if mobj.group(1) not in ids_in_page:
2338 ids_in_page.append(mobj.group(1))
2339 video_ids.extend(ids_in_page)
2341 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2343 pagenum = pagenum + 1
2345 playliststart = self._downloader.params.get('playliststart', 1) - 1
2346 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend == -1 this slice excludes the
# final video (ids[start:-1]); YoutubeUserIE special-cases -1 to mean "to
# the end" — this looks like a latent bug worth confirming and fixing.
2347 video_ids = video_ids[playliststart:playlistend]
2349 for id in video_ids:
2350 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Lists a YouTube user's uploads through the GData API (50 ids per query,
# paged via start-index) and delegates each id to the wrapped YoutubeIE.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2353 class YoutubeUserIE(InfoExtractor):
2354 """Information Extractor for YouTube users."""
# Accepts both full /user/NAME URLs and the "ytuser:NAME" shorthand.
2356 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2357 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2358 _GDATA_PAGE_SIZE = 50
2359 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2360 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2363 def __init__(self, youtube_ie, downloader=None):
2364 InfoExtractor.__init__(self, downloader)
2365 self._youtube_ie = youtube_ie
2369 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2371 def report_download_page(self, username, start_index):
2372 """Report attempt to download user page."""
2373 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2374 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2376 def _real_initialize(self):
2377 self._youtube_ie.initialize()
2379 def _real_extract(self, url):
2381 mobj = re.match(self._VALID_URL, url)
2383 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2386 username = mobj.group(1)
2388 # Download video ids using YouTube Data API. Result size per
2389 # query is limited (currently to 50 videos) so we need to query
2390 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2397 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2398 self.report_download_page(username, start_index)
2400 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2403 page = urllib2.urlopen(request).read()
2404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2408 # Extract video identifiers
2411 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2412 if mobj.group(1) not in ids_in_page:
2413 ids_in_page.append(mobj.group(1))
2415 video_ids.extend(ids_in_page)
2417 # A little optimization - if current page is not
2418 # "full", ie. does not contain PAGE_SIZE video ids then
2419 # we can assume that this page is the last one - there
2420 # are no more ids on further pages - no need to query
2423 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2428 all_ids_count = len(video_ids)
2429 playliststart = self._downloader.params.get('playliststart', 1) - 1
2430 playlistend = self._downloader.params.get('playlistend', -1)
# -1 (the default) means "to the end"; a plain [start:-1] slice would
# wrongly drop the final video, so it is special-cased here.
2432 if playlistend == -1:
2433 video_ids = video_ids[playliststart:]
2435 video_ids = video_ids[playliststart:playlistend]
# NOTE(review): byte string here vs the u'' literals used elsewhere —
# harmless for ASCII but inconsistent with the file's style.
2437 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2438 (username, all_ids_count, len(video_ids)))
2440 for video_id in video_ids:
2441 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extracts direct download links from depositfiles.com file pages by
# simulating a press of the "Free download" button.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2444 class DepositFilesIE(InfoExtractor):
2445 """Information extractor for depositfiles.com"""
# The (?#locale) comment marks the optional two-letter locale path part.
2447 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2449 def __init__(self, downloader=None):
2450 InfoExtractor.__init__(self, downloader)
2454 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2456 def report_download_webpage(self, file_id):
2457 """Report webpage download."""
2458 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2460 def report_extraction(self, file_id):
2461 """Report information extraction."""
2462 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2464 def _real_initialize(self):
2467 def _real_extract(self, url):
2468 # At this point we have a new file
2469 self._downloader.increment_downloads()
2471 file_id = url.split('/')[-1]
2472 # Rebuild url in english locale
2473 url = 'http://depositfiles.com/en/files/' + file_id
2475 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the form submit of the free-download
# button, which makes the page embed the real file URL.
2476 free_download_indication = { 'gateway_result' : '1' }
2477 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2479 self.report_download_webpage(file_id)
2480 webpage = urllib2.urlopen(request).read()
2481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2482 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2485 # Search for the real file URL
2486 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2487 if (mobj is None) or (mobj.group(1) is None):
2488 # Try to figure out reason of the error.
# The site renders restriction notices in a <strong>Attention...</strong>
# element; surface that text to the user when present.
2489 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2490 if (mobj is not None) and (mobj.group(1) is not None):
2491 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2492 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2494 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2497 file_url = mobj.group(1)
2498 file_extension = os.path.splitext(file_url)[1][1:]
2500 # Search for file title
2501 mobj = re.search(r'<b title="(.*?)">', webpage)
2503 self._downloader.trouble(u'ERROR: unable to extract title')
2505 file_title = mobj.group(1).decode('utf-8')
2508 # Process file information
2509 self._downloader.process_info({
2510 'id': file_id.decode('utf-8'),
2511 'url': file_url.decode('utf-8'),
2513 'upload_date': u'NA',
2514 'title': file_title,
2515 'stitle': file_title,
2516 'ext': file_extension.decode('utf-8'),
2520 except UnavailableVideoError, err:
2521 self._downloader.trouble(u'ERROR: unable to download file')
# Extracts Facebook videos: optionally logs in (credentials from options or
# .netrc), downloads the video page, scrapes metadata and format URLs from
# embedded JavaScript, then selects formats per the downloader's options.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2523 class FacebookIE(InfoExtractor):
2524 """Information Extractor for Facebook"""
2526 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
# Mobile login endpoint; redirects back to m.facebook.com on success.
2527 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2528 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection slices this list.
2529 _available_formats = ['highqual', 'lowqual']
2530 _video_extensions = {
2535 def __init__(self, downloader=None):
2536 InfoExtractor.__init__(self, downloader)
2540 return (re.match(FacebookIE._VALID_URL, url) is not None)
2542 def _reporter(self, message):
2543 """Add header and report message."""
2544 self._downloader.to_screen(u'[facebook] %s' % message)
2546 def report_login(self):
2547 """Report attempt to log in."""
2548 self._reporter(u'Logging in')
2550 def report_video_webpage_download(self, video_id):
2551 """Report attempt to download video webpage."""
2552 self._reporter(u'%s: Downloading video webpage' % video_id)
2554 def report_information_extraction(self, video_id):
2555 """Report attempt to extract video information."""
2556 self._reporter(u'%s: Extracting video information' % video_id)
2558 def _parse_page(self, video_webpage):
2559 """Extract video information from page"""
# Map of metadata field -> regex to find it in the page's HTML/JS.
2561 data = {'title': r'class="video_title datawrap">(.*?)</',
2562 'description': r'<div class="datawrap">(.*?)</div>',
2563 'owner': r'\("video_owner_name", "(.*?)"\)',
2564 'upload_date': r'data-date="(.*?)"',
2565 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2568 for piece in data.keys():
2569 mobj = re.search(data[piece], video_webpage)
2570 if mobj is not None:
# Values are \uXXXX-escaped inside JS strings, hence unicode_escape.
2571 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2575 for fmt in self._available_formats:
2576 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2577 if mobj is not None:
2578 # URL is in a Javascript segment inside an escaped Unicode format within
2579 # the generally utf-8 page
2580 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2581 video_info['video_urls'] = video_urls
2585 def _real_initialize(self):
2586 if self._downloader is None:
2591 downloader_params = self._downloader.params
2593 # Attempt to use provided username and password or .netrc data
2594 if downloader_params.get('username', None) is not None:
2595 useremail = downloader_params['username']
2596 password = downloader_params['password']
2597 elif downloader_params.get('usenetrc', False):
2599 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2600 if info is not None:
2604 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2605 except (IOError, netrc.NetrcParseError), err:
2606 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login entirely (anonymous access).
2609 if useremail is None:
2618 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2621 login_results = urllib2.urlopen(request).read()
# A login form still present in the response means the login failed.
2622 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2623 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2625 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2626 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2629 def _real_extract(self, url):
2630 mobj = re.match(self._VALID_URL, url)
2632 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2634 video_id = mobj.group('ID')
2637 self.report_video_webpage_download(video_id)
2638 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2640 page = urllib2.urlopen(request)
2641 video_webpage = page.read()
2642 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2643 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2646 # Start extracting information
2647 self.report_information_extraction(video_id)
2649 # Extract information
2650 video_info = self._parse_page(video_webpage)
2653 if 'owner' not in video_info:
2654 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2656 video_uploader = video_info['owner']
2659 if 'title' not in video_info:
2660 self._downloader.trouble(u'ERROR: unable to extract video title')
2662 video_title = video_info['title']
2663 video_title = video_title.decode('utf-8')
2664 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse anything outside the simple charset to _.
2667 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2668 simple_title = simple_title.strip(ur'_')
2671 if 'thumbnail' not in video_info:
2672 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2673 video_thumbnail = ''
2675 video_thumbnail = video_info['thumbnail']
2679 if 'upload_date' in video_info:
2680 upload_time = video_info['upload_date']
# Parse the RFC-2822-style date and reformat as YYYYMMDD.
2681 timetuple = email.utils.parsedate_tz(upload_time)
2682 if timetuple is not None:
2684 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2689 video_description = video_info.get('description', 'No description available.')
2691 url_map = video_info['video_urls']
2692 if len(url_map.keys()) > 0:
2693 # Decide which formats to download
2694 req_format = self._downloader.params.get('format', None)
2695 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality: only consider formats at or below the given quality.
2697 if format_limit is not None and format_limit in self._available_formats:
2698 format_list = self._available_formats[self._available_formats.index(format_limit):]
2700 format_list = self._available_formats
2701 existing_formats = [x for x in format_list if x in url_map]
2702 if len(existing_formats) == 0:
2703 self._downloader.trouble(u'ERROR: no known formats available for video')
2705 if req_format is None:
2706 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2707 elif req_format == '-1':
2708 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2711 if req_format not in url_map:
2712 self._downloader.trouble(u'ERROR: requested format not available')
2714 video_url_list = [(req_format, url_map[req_format])] # Specific format
2716 for format_param, video_real_url in video_url_list:
2718 # At this point we have a new video
2719 self._downloader.increment_downloads()
2722 video_extension = self._video_extensions.get(format_param, 'mp4')
2724 # Find the video URL in fmt_url_map or conn paramters
2726 # Process video information
2727 self._downloader.process_info({
2728 'id': video_id.decode('utf-8'),
2729 'url': video_real_url.decode('utf-8'),
2730 'uploader': video_uploader.decode('utf-8'),
2731 'upload_date': upload_date,
2732 'title': video_title,
2733 'stitle': simple_title,
2734 'ext': video_extension.decode('utf-8'),
2735 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2736 'thumbnail': video_thumbnail.decode('utf-8'),
2737 'description': video_description.decode('utf-8'),
2740 except UnavailableVideoError, err:
2741 self._downloader.trouble(u'\nERROR: unable to download video')
# Extracts blip.tv videos through the site's JSON API (skin=json) rather
# than scraping HTML.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines not shown here.
2743 class BlipTVIE(InfoExtractor):
2744 """Information extractor for blip.tv"""
2746 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
# Used to derive the file extension from the media URL's last suffix.
2747 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2751 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2753 def report_extraction(self, file_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2757 def _simplify_title(self, title):
# Filesystem-safe title: collapse anything outside the simple charset to _.
2758 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2759 res = res.strip(ur'_')
2762 def _real_extract(self, url):
2763 mobj = re.match(self._VALID_URL, url)
2765 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask the same URL for JSON output instead of the HTML page.
2772 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2773 request = urllib2.Request(json_url)
2774 self.report_extraction(mobj.group(1))
2776 json_code = urllib2.urlopen(request).read()
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2781 json_data = json.loads(json_code)
2782 if 'Post' in json_data:
2783 data = json_data['Post']
# blip.tv datestamps look like "m-d-yy H:MMam/pm"; normalize to YYYYMMDD.
2787 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2788 video_url = data['media']['url']
2789 umobj = re.match(self._URL_EXT, video_url)
2791 raise ValueError('Can not determine filename extension')
2792 ext = umobj.group(1)
2794 self._downloader.increment_downloads()
2797 'id': data['item_id'],
2799 'uploader': data['display_name'],
2800 'upload_date': upload_date,
2801 'title': data['title'],
2802 'stitle': self._simplify_title(data['title']),
2804 'format': data['media']['mimeType'],
2805 'thumbnail': data['thumbnailUrl'],
2806 'description': data['description'],
2807 'player_url': data['embedUrl']
2809 except (ValueError,KeyError), err:
2810 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2814 self._downloader.process_info(info)
2815 except UnavailableVideoError, err:
2816 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for the post-processing chain; subclasses override run().
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark docstring/body lines not shown here.
2819 class PostProcessor(object):
2820 """Post Processor class.
2822 PostProcessor objects can be added to downloaders with their
2823 add_post_processor() method. When the downloader has finished a
2824 successful download, it will take its internal chain of PostProcessors
2825 and start calling the run() method on each one of them, first with
2826 an initial argument and then with the returned value of the previous
2829 The chain will be stopped if one of them ever returns None or the end
2830 of the chain is reached.
2832 PostProcessor objects follow a "mutual registration" process similar
2833 to InfoExtractor objects.
2838 def __init__(self, downloader=None):
2839 self._downloader = downloader
2841 def set_downloader(self, downloader):
2842 """Sets the downloader for this PP."""
2843 self._downloader = downloader
2845 def run(self, information):
2846 """Run the PostProcessor.
2848 The "information" argument is a dictionary like the ones
2849 composed by InfoExtractors. The only difference is that this
2850 one has an extra field called "filepath" that points to the
2853 When this method returns None, the postprocessing chain is
2854 stopped. However, this method may return an information
2855 dictionary that will be passed to the next postprocessing
2856 object in the chain. It can be the one it received after
2857 changing some fields.
2859 In addition, this method may raise a PostProcessingError
2860 exception that will be taken into account by the downloader
2863 return information # by default, do nothing
# Post-processor that extracts the audio track from a downloaded video
# using ffprobe (codec detection) and ffmpeg (extraction/transcoding).
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines (returns, try:, else:) not shown here.
2865 class FFmpegExtractAudioPP(PostProcessor):
2867 def __init__(self, downloader=None, preferredcodec=None):
2868 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec losslessly when it is aac/mp3.
2869 if preferredcodec is None:
2870 preferredcodec = 'best'
2871 self._preferredcodec = preferredcodec
2874 def get_audio_codec(path):
# '--' terminates option parsing so a path starting with '-' is safe.
2876 cmd = ['ffprobe', '-show_streams', '--', path]
2877 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2878 output = handle.communicate()[0]
2879 if handle.wait() != 0:
2881 except (IOError, OSError):
# Scan ffprobe's key=value stream dump; remember the last codec_name and
# report it once a stream of codec_type=audio is seen.
2884 for line in output.split('\n'):
2885 if line.startswith('codec_name='):
2886 audio_codec = line.split('=')[1].strip()
2887 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2892 def run_ffmpeg(path, out_path, codec, more_opts):
2894 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2895 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2897 except (IOError, OSError):
2900 def run(self, information):
2901 path = information['filepath']
2903 filecodec = self.get_audio_codec(path)
2904 if filecodec is None:
2905 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2909 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2910 if filecodec == 'aac' or filecodec == 'mp3':
2911 # Lossless if possible
2913 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
2914 if filecodec == 'aac':
2915 more_opts = ['-f', 'adts']
2918 acodec = 'libmp3lame'
2920 more_opts = ['-ab', '128k']
2922 # We convert the audio (lossy)
2923 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2924 extension = self._preferredcodec
2925 more_opts = ['-ab', '128k']
2926 if self._preferredcodec == 'aac':
2927 more_opts += ['-f', 'adts']
2929 (prefix, ext) = os.path.splitext(path)
2930 new_path = prefix + '.' + extension
2931 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2932 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2935 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
2940 except (IOError, OSError):
2941 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the extracted audio for the rest of the chain.
2944 information['filepath'] = new_path
2947 ### MAIN PROGRAM ###
2948 if __name__ == '__main__':
2950 # Modules needed only when running the main program
2954 # Function to update the program file with the latest version from the repository.
# Self-update: fetch the latest released version number from the repo,
# download that version of the script and overwrite `filename` in place.
# Exits the process on any failure.
# NOTE(review): elided listing — leading numbers are original line numbers
# and gaps mark lines (try:, stream.close()) not shown here.
2955 def update_self(downloader, filename):
2956 # Note: downloader only used for options
# Fail early if we cannot overwrite our own file.
2957 if not os.access(filename, os.W_OK):
2958 sys.exit('ERROR: no write permissions on %s' % filename)
2960 downloader.to_screen('Updating to latest stable version...')
2962 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2963 latest_version = urllib.urlopen(latest_url).read().strip()
# The version string doubles as the git tag/branch to fetch from.
2964 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2965 newcontent = urllib.urlopen(prog_url).read()
2966 except (IOError, OSError), err:
2967 sys.exit('ERROR: unable to download latest version')
2969 stream = open(filename, 'w')
2970 stream.write(newcontent)
2972 except (IOError, OSError), err:
2973 sys.exit('ERROR: unable to overwrite current version')
2974 downloader.to_screen('Updated to version %s' % latest_version)
2976 # Parse command line
2977 parser = optparse.OptionParser(
2978 usage='Usage: %prog [options] url...',
2979 version='2011.07.09-phihag',
2980 conflict_handler='resolve',
2983 parser.add_option('-h', '--help',
2984 action='help', help='print this help text and exit')
2985 parser.add_option('-v', '--version',
2986 action='version', help='print program version and exit')
2987 parser.add_option('-U', '--update',
2988 action='store_true', dest='update_self', help='update this program to latest stable version')
2989 parser.add_option('-i', '--ignore-errors',
2990 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2991 parser.add_option('-r', '--rate-limit',
2992 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2993 parser.add_option('-R', '--retries',
2994 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2995 parser.add_option('--playlist-start',
2996 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2997 parser.add_option('--playlist-end',
2998 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2999 parser.add_option('--dump-user-agent',
# --- Command-line option declarations (optparse) ---
# NOTE(review): fragment — the OptionParser construction and the opening of the
# --dump-user-agent add_option call sit above this excerpt.
3000 action='store_true', dest='dump_user_agent',
3001 help='display the current browser identification', default=False)
# Account credentials: -u/-p explicit, or -n to read from ~/.netrc.
3003 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3004 authentication.add_option('-u', '--username',
3005 dest='username', metavar='USERNAME', help='account username')
3006 authentication.add_option('-p', '--password',
3007 dest='password', metavar='PASSWORD', help='account password')
3008 authentication.add_option('-n', '--netrc',
3009 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3010 parser.add_option_group(authentication)
# Format selection: -f picks one code; --all-formats stores the sentinel '-1'
# into the same dest, which the outtmpl logic below treats specially.
3012 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3013 video_format.add_option('-f', '--format',
3014 action='store', dest='format', metavar='FORMAT', help='video format code')
3015 video_format.add_option('--all-formats',
3016 action='store_const', dest='format', help='download all available video formats', const='-1')
3017 video_format.add_option('--max-quality',
3018 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3019 parser.add_option_group(video_format)
# Verbosity / simulation: every --get-* flag implies quiet+simulate (combined
# when the FileDownloader params are assembled further down).
3021 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3022 verbosity.add_option('-q', '--quiet',
3023 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3024 verbosity.add_option('-s', '--simulate',
3025 action='store_true', dest='simulate', help='do not download video', default=False)
3026 verbosity.add_option('-g', '--get-url',
3027 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3028 verbosity.add_option('-e', '--get-title',
3029 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3030 verbosity.add_option('--get-thumbnail',
3031 action='store_true', dest='getthumbnail',
3032 help='simulate, quiet but print thumbnail URL', default=False)
3033 verbosity.add_option('--get-description',
3034 action='store_true', dest='getdescription',
3035 help='simulate, quiet but print video description', default=False)
3036 verbosity.add_option('--get-filename',
3037 action='store_true', dest='getfilename',
3038 help='simulate, quiet but print output filename', default=False)
3039 verbosity.add_option('--no-progress',
3040 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3041 verbosity.add_option('--console-title',
3042 action='store_true', dest='consoletitle',
3043 help='display progress in console titlebar', default=False)
3044 parser.add_option_group(verbosity)
# Filesystem: output naming (-t/-l/-A/-o are mutually constrained — checked
# after parsing), batch input, overwrite/resume policy, cookies, .part files,
# mtime preservation, and sidecar description / info-json files.
3046 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3047 filesystem.add_option('-t', '--title',
3048 action='store_true', dest='usetitle', help='use title in file name', default=False)
3049 filesystem.add_option('-l', '--literal',
3050 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3051 filesystem.add_option('-A', '--auto-number',
3052 action='store_true', dest='autonumber',
3053 help='number downloaded files starting from 00000', default=False)
3054 filesystem.add_option('-o', '--output',
3055 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3056 filesystem.add_option('-a', '--batch-file',
3057 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3058 filesystem.add_option('-w', '--no-overwrites',
3059 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3060 filesystem.add_option('-c', '--continue',
3061 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3062 filesystem.add_option('--cookies',
3063 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3064 filesystem.add_option('--no-part',
3065 action='store_true', dest='nopart', help='do not use .part files', default=False)
# --no-mtime uses store_false on 'updatetime' (default True): keeping the
# Last-modified mtime is the default behaviour, the flag switches it off.
3066 filesystem.add_option('--no-mtime',
3067 action='store_false', dest='updatetime',
3068 help='do not use the Last-modified header to set the file modification time', default=True)
3069 filesystem.add_option('--write-description',
3070 action='store_true', dest='writedescription',
3071 help='write video description to a .description file', default=False)
3072 filesystem.add_option('--write-info-json',
3073 action='store_true', dest='writeinfojson',
3074 help='write video metadata to a .info.json file', default=False)
3075 parser.add_option_group(filesystem)
# Post-processing: audio extraction via external ffmpeg/ffprobe.
3077 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3078 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3079 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3080 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3081 help='"best", "aac" or "mp3"; best by default')
3082 parser.add_option_group(postproc)
# Parse the command line; 'args' holds the positional URL arguments.
3084 (opts, args) = parser.parse_args()
3086 # Open appropriate CookieJar
# No --cookies file: in-memory jar only (nothing persisted).  Otherwise use a
# MozillaCookieJar bound to the file, pre-loading it when readable.
3087 if opts.cookiefile is None:
3088 jar = cookielib.CookieJar()
# NOTE(review): elided lines here — an else: branch and a try: around the
# jar.load() call belong between these statements in the full file.
3091 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3092 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
# Python 2 except syntax; failure to read the cookie file is fatal.
3094 except (IOError, OSError), err:
3095 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string (from the module-level std_headers)
# and fall through.  NOTE(review): the sys.exit(0) that follows is elided here.
3098 if opts.dump_user_agent:
3099 print std_headers['User-Agent']
3102 # General configuration
# Install a global urllib2 opener: proxy support, the cookie jar chosen above,
# and the project's YoutubeDLHandler; plus a 5-minute socket timeout.
3103 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3104 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3105 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3107 # Batch file verification
# Collect URLs from --batch-file (or stdin when the file is '-'), strip
# whitespace, drop blanks and comment lines starting with '#', '/' or ';',
# then append the positional command-line URLs.
# NOTE(review): elided lines — the batchurls initialisation, a try:, and the
# stdin branch body sit between the statements shown here.
3109 if opts.batchfile is not None:
3111 if opts.batchfile == '-':
3114 batchfd = open(opts.batchfile, 'r')
3115 batchurls = batchfd.readlines()
3116 batchurls = [x.strip() for x in batchurls]
3117 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3119 sys.exit(u'ERROR: batch file could not be read')
3120 all_urls = batchurls + args
3122 # Conflicting, missing and erroneous options
# Each violation aborts via parser.error (prints usage + message, exits 2).
3123 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3124 parser.error(u'using .netrc conflicts with giving username/password')
3125 if opts.password is not None and opts.username is None:
3126 parser.error(u'account username missing')
# -o is exclusive with the -t/-l/-A naming shortcuts; -t and -l are exclusive
# with each other.
3127 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3128 parser.error(u'using output template conflicts with using title, literal title or auto number')
3129 if opts.usetitle and opts.useliteral:
3130 parser.error(u'using title conflicts with using literal title')
# Username given without password: prompt interactively (not echoed).
3131 if opts.username is not None and opts.password is None:
3132 opts.password = getpass.getpass(u'Type account password and press return:')
# --rate-limit: convert the human-readable string (e.g. '50k') to bytes/sec.
3133 if opts.ratelimit is not None:
3134 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3135 if numeric_limit is None:
3136 parser.error(u'invalid rate limit specified')
3137 opts.ratelimit = numeric_limit
# Numeric coercion of retries / playlist bounds (Python 2 'long' and comma
# except syntax).  NOTE(review): the try: headers and the error-raising bodies
# for the playlist range checks are elided from this excerpt.
3138 if opts.retries is not None:
3140 opts.retries = long(opts.retries)
3141 except (TypeError, ValueError), err:
3142 parser.error(u'invalid retry count specified')
3144 opts.playliststart = long(opts.playliststart)
3145 if opts.playliststart <= 0:
3147 except (TypeError, ValueError), err:
3148 parser.error(u'invalid playlist start number specified')
# playlistend of -1 means 'to the end'; otherwise it must be positive and not
# before playliststart.
3150 opts.playlistend = long(opts.playlistend)
3151 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3153 except (TypeError, ValueError), err:
3154 parser.error(u'invalid playlist end number specified')
# --audio-format only accepts the three values advertised in its help text.
3155 if opts.extractaudio:
3156 if opts.audioformat not in ['best', 'aac', 'mp3']:
3157 parser.error(u'invalid audio format specified')
3159 # Information extractors
# One instance per supported site.  The YouTube playlist/user/search
# extractors and the Google/Yahoo search extractors delegate the actual video
# extraction to the base IE instance passed into their constructors.
3160 youtube_ie = YoutubeIE()
3161 metacafe_ie = MetacafeIE(youtube_ie)
3162 dailymotion_ie = DailymotionIE()
3163 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3164 youtube_user_ie = YoutubeUserIE(youtube_ie)
3165 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3166 google_ie = GoogleIE()
3167 google_search_ie = GoogleSearchIE(google_ie)
3168 photobucket_ie = PhotobucketIE()
3169 yahoo_ie = YahooIE()
3170 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3171 deposit_files_ie = DepositFilesIE()
3172 facebook_ie = FacebookIE()
3173 bliptv_ie = BlipTVIE()
# GenericIE is the catch-all fallback; it is registered last (see below).
3174 generic_ie = GenericIE()
# Build the central FileDownloader from the parsed options.
# NOTE(review): the closing '})' of this dict literal (original line 3214) is
# elided from this excerpt.
3177 fd = FileDownloader({
3178 'usenetrc': opts.usenetrc,
3179 'username': opts.username,
3180 'password': opts.password,
# Any --get-* flag forces both quiet and simulate mode.
3181 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3182 'forceurl': opts.geturl,
3183 'forcetitle': opts.gettitle,
3184 'forcethumbnail': opts.getthumbnail,
3185 'forcedescription': opts.getdescription,
3186 'forcefilename': opts.getfilename,
3187 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3188 'format': opts.format,
3189 'format_limit': opts.format_limit,
# Output template resolution: an explicit -o template (decoded from the
# locale encoding) wins; otherwise an 'and/or' cascade picks a default based
# on --all-formats ('-1'), -t/-l and -A, falling back to '%(id)s.%(ext)s'.
3190 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3191 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3192 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3193 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3194 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3195 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3196 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3197 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3198 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3199 or u'%(id)s.%(ext)s'),
3200 'ignoreerrors': opts.ignoreerrors,
3201 'ratelimit': opts.ratelimit,
3202 'nooverwrites': opts.nooverwrites,
3203 'retries': opts.retries,
3204 'continuedl': opts.continue_dl,
3205 'noprogress': opts.noprogress,
3206 'playliststart': opts.playliststart,
3207 'playlistend': opts.playlistend,
# '-o -' means write the video to stdout, so logging must go to stderr.
3208 'logtostderr': opts.outtmpl == '-',
3209 'consoletitle': opts.consoletitle,
3210 'nopart': opts.nopart,
3211 'updatetime': opts.updatetime,
3212 'writedescription': opts.writedescription,
3213 'writeinfojson': opts.writeinfojson,
# Register the extractors.  Order matters: more specific extractors (search,
# playlist, user) come before the plain site extractors they wrap, and the
# generic fallback is added last.
3215 fd.add_info_extractor(youtube_search_ie)
3216 fd.add_info_extractor(youtube_pl_ie)
3217 fd.add_info_extractor(youtube_user_ie)
3218 fd.add_info_extractor(metacafe_ie)
3219 fd.add_info_extractor(dailymotion_ie)
3220 fd.add_info_extractor(youtube_ie)
3221 fd.add_info_extractor(google_ie)
3222 fd.add_info_extractor(google_search_ie)
3223 fd.add_info_extractor(photobucket_ie)
3224 fd.add_info_extractor(yahoo_ie)
3225 fd.add_info_extractor(yahoo_search_ie)
3226 fd.add_info_extractor(deposit_files_ie)
3227 fd.add_info_extractor(facebook_ie)
3228 fd.add_info_extractor(bliptv_ie)
3230 # This must come last since it's the
3231 # fallback if none of the others work
3232 fd.add_info_extractor(generic_ie)
# Optional audio-extraction post-processor (ffmpeg/ffprobe based); the
# audio format was validated earlier against {'best', 'aac', 'mp3'}.
3235 if opts.extractaudio:
3236 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update: replace the running script (sys.argv[0]) with the latest release.
3239 if opts.update_self:
3240 update_self(fd, sys.argv[0])
# With no URLs, only --update is a valid invocation.
# NOTE(review): the sys.exit() taken when update_self was requested is elided
# between these branches in the full file.
3243 if len(all_urls) < 1:
3244 if not opts.update_self:
3245 parser.error(u'you must provide at least one URL')
# Main download loop; 'retcode' becomes the process exit status (the
# sys.exit(retcode) line is elided from this excerpt).
3248 retcode = fd.download(all_urls)
3250 # Dump cookie jar if requested
# NOTE(review): the try: and the jar.save() call are elided here; only the
# Python 2 style except clause is visible.
3251 if opts.cookiefile is not None:
3254 except (IOError, OSError), err:
3255 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level handlers for the enclosing try: (opened before this excerpt):
# download failures already reported, fixed-name collisions, and Ctrl-C.
3259 except DownloadError:
3261 except SameFileError:
3262 sys.exit(u'ERROR: fixed output name but more than one file to download')
3263 except KeyboardInterrupt:
3264 sys.exit(u'\nERROR: Interrupted by user')