2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# NOTE(review): fragment of the module-level std_headers dict -- its opening
# line (presumably `std_headers = {`) precedes this sampled extraction, and
# every line below carries a stray original-line-number prefix.
24 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
25 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
26 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
27 'Accept-Language': 'en-us,en;q=0.5',
# Python 2: str.decode('ascii') yields a unicode string.  This is the set of
# characters kept when building simplified titles (used via re.sub in
# YoutubeIE._real_extract).
30 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
32 def preferredencoding():
33 """Get preferred encoding.
35 Returns the best encoding scheme for the system, based on
36 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): the closing docstring quotes and the generator's yield
# loop are missing from this sampled extraction.  Presumably the inner
# generator yields `pref` (with some fallback tweaking) so the locale
# lookup runs once -- TODO confirm against the full source.
38 def yield_preferredencoding():
40 pref = locale.getpreferredencoding()
# .next() is the Python 2 generator protocol (next(gen) in Python 3).
46 return yield_preferredencoding().next()
# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
48 class DownloadError(Exception):
49 """Download Error exception.
51 This exception may be thrown by FileDownloader objects if they are not
52 configured to continue on errors. They will contain the appropriate
# NOTE(review): docstring is cut off mid-sentence; the closing quotes and
# any class body are missing from this sampled extraction.
# Raised by FileDownloader.download() when multiple URLs would collide on
# one fixed output template.
57 class SameFileError(Exception):
58 """Same File exception.
60 This exception will be thrown by FileDownloader objects if they detect
61 multiple files would have to be downloaded to the same file on disk.
# NOTE(review): closing docstring quotes missing from this extraction.
# Caught by FileDownloader.process_info() around post_process().
65 class PostProcessingError(Exception):
66 """Post Processing exception.
68 This exception may be raised by PostProcessor's .run() method to
69 indicate an error in the postprocessing task.
# NOTE(review): closing docstring quotes missing from this extraction.
# Drives the format-fallback retry loop in YoutubeIE._real_extract().
73 class UnavailableFormatError(Exception):
74 """Unavailable Format exception.
76 This exception will be thrown when a video is requested
77 in a format that is not available for that video.
# NOTE(review): closing docstring quotes missing from this extraction.
81 class ContentTooShortError(Exception):
82 """Content Too Short exception.
84 This exception may be raised by FileDownloader objects when a file they
85 download is too small for what the server announced first, indicating
86 the connection was probably interrupted.
# NOTE(review): closing docstring quotes (lines 87-91) missing from this
# extraction.  Both byte counts are stored so callers can report
# "expected X bytes and served Y" (see FileDownloader.process_info).
92 def __init__(self, downloaded, expected):
93 self.downloaded = downloaded
94 self.expected = expected
96 class FileDownloader(object):
97 """File Downloader class.
99 File downloader objects are the ones responsible of downloading the
100 actual video file and writing it to disk if the user has requested
101 it, among some other tasks. In most cases there should be one per
102 program. As, given a video URL, the downloader doesn't know how to
103 extract all the needed information, task that InfoExtractors do, it
104 has to pass the URL to one of them.
106 For this, file downloader objects have a method that allows
107 InfoExtractors to be registered in a given order. When it is passed
108 a URL, the file downloader handles it to the first InfoExtractor it
109 finds that reports being able to handle it. The InfoExtractor extracts
110 all the information about the video or videos the URL refers to, and
111 asks the FileDownloader to process the video information, possibly
112 downloading the video.
114 File downloaders accept a lot of parameters. In order not to saturate
115 the object constructor with arguments, it receives a dictionary of
116 options instead. These options are available through the params
117 attribute for the InfoExtractors to use. The FileDownloader also
118 registers itself as the downloader in charge for the InfoExtractors
119 that are added to it, so this is a "mutual registration".
# Recognized keys of the params dict (per the docstring lines below):
123 username: Username for authentication purposes.
124 password: Password for authentication purposes.
125 usenetrc: Use netrc for authentication instead.
126 quiet: Do not print messages to stdout.
127 forceurl: Force printing final URL.
128 forcetitle: Force printing title.
129 simulate: Do not download the video files.
130 format: Video format code.
131 outtmpl: Template for output names.
132 ignoreerrors: Do not stop on download errors.
133 ratelimit: Download speed limit, in bytes/sec.
134 nooverwrites: Prevent overwriting files.
135 continuedl: Try to continue downloads if possible.
# Class-level default; each instance resets it to 0 in __init__, and
# trouble() sets it to 1 when an error is tolerated.  download() returns it.
141 _download_retcode = None
143 def __init__(self, params):
144 """Create a FileDownloader object with the given options."""
# NOTE(review): lines 145-146 are missing from this sampled extraction --
# presumably they initialize the IE/PP lists and store `params` on self
# (self.params is read throughout the class) -- TODO confirm.
147 self._download_retcode = 0
151 def pmkdir(filename):
152 """Create directory components in filename. Similar to Unix "mkdir -p"."""
# Build every ancestor prefix of the path, shortest first, each ending
# with the OS path separator.
153 components = filename.split(os.sep)
154 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
155 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
# NOTE(review): `dir` shadows the builtin; the loop body (presumably an
# os.mkdir(dir) call on the missing line 158) is absent from this
# sampled extraction -- TODO confirm.
156 for dir in aggregate:
157 if not os.path.exists(dir):
# Format a byte count as a short human-readable string, e.g. "1.50M".
161 def format_bytes(bytes):
# NOTE(review): lines 162-168 are missing here -- the str-branch body and,
# presumably, guards for None/zero (math.log is undefined for 0).
164 if type(bytes) is str:
# exponent 0 maps to suffix 'b'; each step is a factor of 1024.
169 exponent = long(math.log(bytes, 1024.0))
170 suffix = 'bkMGTPEZY'[exponent]
171 converted = float(bytes) / float(1024**exponent)
172 return '%.2f%s' % (converted, suffix)
# Format download progress as a right-aligned percentage string.
175 def calc_percent(byte_counter, data_len):
# NOTE(review): lines 176-177 are missing -- presumably a guard returning a
# placeholder when data_len is None/0 (as written this would divide by zero).
178 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimate remaining time as "MM:SS" from elapsed time and bytes so far.
181 def calc_eta(start, now, total, current):
# NOTE(review): lines 182-184 (presumably `dif = now - start` and a
# placeholder early return) are missing from this sampled extraction.
185 if current == 0 or dif < 0.001: # One millisecond
187 rate = float(current) / dif
188 eta = long((float(total) - float(current)) / rate)
189 (eta_mins, eta_secs) = divmod(eta, 60)
# NOTE(review): lines 190-191 missing -- presumably a cap/placeholder for
# very large ETAs before the normal formatting below -- TODO confirm.
192 return '%02d:%02d' % (eta_mins, eta_secs)
# Format average download speed as a right-aligned "<size>/s" string.
195 def calc_speed(start, now, bytes):
# NOTE(review): line 196 (presumably `dif = now - start`) is missing.
197 if bytes == 0 or dif < 0.001: # One millisecond
198 return '%10s' % '---b/s'
199 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adapt the next read size from how long the previous read took.
202 def best_block_size(elapsed_time, bytes):
# Allowed range: between half and double the previous block size,
# never below 1 byte and never above 4 MB.
203 new_min = max(bytes / 2.0, 1.0)
204 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
205 if elapsed_time < 0.001:
# NOTE(review): the return statements (line 206 and lines 208 onward,
# presumably clamping `rate` between new_min and new_max) are missing
# from this sampled extraction.
207 rate = bytes / elapsed_time
215 def parse_bytes(bytestr):
216 """Parse a string indicating a byte quantity into a long integer."""
217 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# NOTE(review): the matchobj-is-None guard (lines 218-219) is missing from
# this sampled extraction.
220 number = float(matchobj.group(1))
# An empty suffix indexes to 0 ('bkmgtpezy'.index('') == 0) -> multiplier 1.
221 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
222 return long(round(number * multiplier))
# NOTE(review): the `def verify_url(...)` line (225) is missing from this
# sampled extraction, as is the return statement -- per the docstring it
# presumably returns the real (possibly redirected) data URL.
226 """Verify a URL is valid and data could be downloaded. Return real data URL."""
227 request = urllib2.Request(url, None, std_headers)
228 data = urllib2.urlopen(request)
234 def add_info_extractor(self, ie):
235 """Add an InfoExtractor object to the end of the list."""
# NOTE(review): the append to the internal IE list (line 236) is missing.
# Mutual registration: the IE gets a back-reference to this downloader.
237 ie.set_downloader(self)
239 def add_post_processor(self, pp):
240 """Add a PostProcessor object to the end of the chain."""
# NOTE(review): the append to the internal PP chain (line 241) is missing.
242 pp.set_downloader(self)
244 def to_stdout(self, message, skip_eol=False):
245 """Print message to stdout if not in quiet mode."""
246 if not self.params.get('quiet', False):
# Python 2 print statement: the trailing comma suppresses print's own
# newline; [u'\n', u''][skip_eol] appends one explicitly unless skip_eol
# is true (used for the \r-based progress line).  Output is encoded with
# the locale-preferred encoding.
247 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
250 def to_stderr(self, message):
251 """Print message to stderr."""
# Python 2 `print >>file` chevron syntax; stderr is NOT silenced by 'quiet'.
252 print >>sys.stderr, message.encode(preferredencoding())
254 def fixed_template(self):
255 """Checks if the output template is fixed."""
256 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
258 def trouble(self, message=None):
259 """Determine action to take when a download problem appears.
261 Depending on if the downloader has been configured to ignore
262 download errors or not, this method may throw an exception or
263 not when errors are found, after printing the message.
# NOTE(review): the closing docstring quotes (line 264) are missing from
# this sampled extraction.
265 if message is not None:
266 self.to_stderr(message)
# Either abort via DownloadError, or record failure in the exit code and
# keep going (see _download_retcode / download()).
267 if not self.params.get('ignoreerrors', False):
268 raise DownloadError(message)
269 self._download_retcode = 1
271 def slow_down(self, start_time, byte_counter):
272 """Sleep if the download speed is over the rate limit."""
273 rate_limit = self.params.get('ratelimit', None)
274 if rate_limit is None or byte_counter == 0:
# NOTE(review): lines 275-276 and 278-279 are missing -- presumably the
# early return, `now = time.time()`, and a zero-elapsed guard.
277 elapsed = now - start_time
280 speed = float(byte_counter) / elapsed
281 if speed > rate_limit:
# Sleep exactly long enough that byte_counter/elapsed == rate_limit.
282 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
284 def report_destination(self, filename):
285 """Report destination filename."""
# Called by the download paths once the target filename is known.
286 self.to_stdout(u'[download] Destination: %s' % filename)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
	"""Print an in-place (\\r, no newline) progress line: percent, size, speed, ETA."""
	line = u'\r[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str)
	self.to_stdout(line, skip_eol=True)
293 def report_resuming_byte(self, resume_len):
294 """Report attempt to resume at given byte."""
295 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
297 def report_file_already_downloaded(self, file_name):
298 """Report file has already been fully downloaded."""
# Emitted when a resume request 416s and Content-Length equals local size.
299 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
301 def report_unable_to_resume(self):
"""(sic) -- see below"""
302 """Report it was impossible to resume download."""
# Emitted when the server rejects the Range request (non-matching 416).
303 self.to_stdout(u'[download] Unable to resume')
305 def report_finish(self):
306 """Report download finished."""
# NOTE(review): the method body (line 307, presumably a to_stdout call
# terminating the progress line) is missing from this sampled extraction.
309 def process_info(self, info_dict):
310 """Process a single dictionary returned by an InfoExtractor."""
# NOTE(review): several `try:`/blank lines are missing from this sampled
# extraction -- the orphan `except` clauses below belong to them.
311 # Do nothing else if in simulate mode
312 if self.params.get('simulate', False):
# In simulate mode the URL is only verified, never downloaded; a failure
# is surfaced as UnavailableFormatError so YoutubeIE can retry formats.
314 info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
315 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
316 raise UnavailableFormatError
# Forced printing honors the forcetitle/forceurl params.
319 if self.params.get('forcetitle', False):
320 print info_dict['title'].encode(preferredencoding())
321 if self.params.get('forceurl', False):
322 print info_dict['url'].encode(preferredencoding())
# Expand the output template with the info dict plus an 'epoch' timestamp.
327 template_dict = dict(info_dict)
328 template_dict['epoch'] = unicode(long(time.time()))
329 filename = self.params['outtmpl'] % template_dict
330 except (ValueError, KeyError), err:
331 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
332 if self.params.get('nooverwrites', False) and os.path.exists(filename):
333 self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
337 self.pmkdir(filename)
338 except (OSError, IOError), err:
339 self.trouble('ERROR: unable to create directories: %s' % str(err))
# OSError/IOError during the actual download also means "try next format".
343 success = self._do_download(filename, info_dict['url'].encode('utf-8'))
344 except (OSError, IOError), err:
345 raise UnavailableFormatError
346 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
347 self.trouble('ERROR: unable to download video data: %s' % str(err))
349 except (ContentTooShortError, ), err:
350 self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Run the post-processing chain only on (presumably) successful downloads.
355 self.post_process(filename, info_dict)
356 except (PostProcessingError), err:
357 self.trouble('ERROR: postprocessing: %s' % str(err))
360 def download(self, url_list):
361 """Download a given list of URLs."""
# A fixed template (no %(...)s fields) with several URLs would write every
# download to the same file -- refuse up front.
362 if len(url_list) > 1 and self.fixed_template():
363 raise SameFileError(self.params['outtmpl'])
# NOTE(review): the two loop headers (presumably `for url in url_list:` and
# `for ie in self._ies:`) plus the continue/extract/break lines are missing
# from this sampled extraction; the first IE whose suitable(url) is true
# handles the URL.
366 suitable_found = False
368 # Go to next InfoExtractor if not suitable
369 if not ie.suitable(url):
372 # Suitable InfoExtractor found
373 suitable_found = True
375 # Extract information from URL and process it
378 # Suitable InfoExtractor had been found; go to next URL
381 if not suitable_found:
382 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
# 0 on success, 1 if any tolerated error occurred (set by trouble()).
384 return self._download_retcode
386 def post_process(self, filename, ie_info):
387 """Run the postprocessing chain on the given file."""
# NOTE(review): the creation of `info` (presumably a copy of ie_info) and
# the loop feeding it through the PP chain are missing from this
# sampled extraction.
389 info['filepath'] = filename
395 def _download_with_rtmpdump(self, filename, url):
396 self.report_destination(filename)
# Probe availability of the external rtmpdump binary before starting.
398 # Check for rtmpdump first
400 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
401 except (OSError, IOError):
402 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
405 # Download using rtmpdump. rtmpdump returns exit code 2 when
406 # the connection was interrupted and resuming appears to be
407 # possible. This is part of rtmpdump's normal usage, AFAIK.
# [[], ['-e']][bool] appends '-e' (resume) only when continuedl is set.
408 retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
# NOTE(review): the retry loop header (presumably `while retval == 2:`)
# and the success/return lines are missing from this sampled extraction.
410 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
411 time.sleep(2.0) # This seems to be needed
412 retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
414 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
417 self.trouble('ERROR: rtmpdump exited with code %d' % retval)
420 def _do_download(self, filename, url):
# Delegate rtmp:// URLs to the external rtmpdump helper.
421 # Attempt to download using rtmpdump
422 if url.startswith('rtmp'):
423 return self._download_with_rtmpdump(filename, url)
# Two requests: `request` may get a Range header for resuming;
# `basic_request` stays plain so we can fall back after a 416.
427 basic_request = urllib2.Request(url, None, std_headers)
428 request = urllib2.Request(url, None, std_headers)
430 # Establish possible resume length
431 if os.path.isfile(filename):
432 resume_len = os.path.getsize(filename)
# NOTE(review): the else branch (resume_len = 0) and the open_mode setup
# lines are missing from this sampled extraction.
436 # Request parameters in case of being able to resume
437 if self.params.get('continuedl', False) and resume_len != 0:
438 self.report_resuming_byte(resume_len)
439 request.add_header('Range','bytes=%d-' % resume_len)
442 # Establish connection
444 data = urllib2.urlopen(request)
445 except (urllib2.HTTPError, ), err:
# 416 means our Range start is at/past EOF: either the file is already
# complete, or the server refuses ranges -- retry without the header.
446 if err.code != 416: # 416 is 'Requested range not satisfiable'
449 data = urllib2.urlopen(basic_request)
450 content_length = data.info()['Content-Length']
452 if content_length is not None and long(content_length) == resume_len:
453 # Because the file had already been fully downloaded
454 self.report_file_already_downloaded(filename)
457 # Because the server didn't let us
458 self.report_unable_to_resume()
# data_len stays the raw header string (or None) -- see the str() compare
# at the end of this method.
461 data_len = data.info().get('Content-length', None)
462 data_len_str = self.format_bytes(data_len)
# NOTE(review): the while-loop header, timing (before/after), byte_counter
# and block_size initialization lines are missing from this extraction.
469 data_block = data.read(block_size)
471 data_block_len = len(data_block)
472 if data_block_len == 0:
474 byte_counter += data_block_len
# Deferred open so an empty/failed response never touches the file.
476 # Open file just in time
479 stream = open(filename, open_mode)
480 self.report_destination(filename)
481 except (OSError, IOError), err:
482 self.trouble('ERROR: unable to open for writing: %s' % str(err))
484 stream.write(data_block)
# Adapt the next read size to the measured throughput.
485 block_size = self.best_block_size(after - before, data_block_len)
488 percent_str = self.calc_percent(byte_counter, data_len)
489 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
490 speed_str = self.calc_speed(start, time.time(), byte_counter)
491 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
# Throttle to the configured ratelimit, if any.
494 self.slow_down(start, byte_counter)
# String comparison is intentional here: data_len is the raw header value.
497 if data_len is not None and str(byte_counter) != data_len:
498 raise ContentTooShortError(byte_counter, long(data_len))
501 class InfoExtractor(object):
502 """Information Extractor class.
504 Information extractors are the classes that, given a URL, extract
505 information from the video (or videos) the URL refers to. This
506 information includes the real video URL, the video title and simplified
507 title, author and others. The information is stored in a dictionary
508 which is then passed to the FileDownloader. The FileDownloader
509 processes this information possibly downloading the video to the file
510 system, among other possible outcomes. The dictionaries must include
511 the following fields:
513 id: Video identifier.
514 url: Final video URL.
515 uploader: Nickname of the video uploader.
516 title: Literal title.
517 stitle: Simplified title.
518 ext: Video filename extension.
520 Subclasses of this one should re-define the _real_initialize() and
521 _real_extract() methods, as well as the suitable() static method.
522 Probably, they should also be instantiated and added to the main
# NOTE(review): docstring cut off here; closing quotes missing from this
# sampled extraction, as are several method bodies below.
529 def __init__(self, downloader=None):
530 """Constructor. Receives an optional downloader."""
532 self.set_downloader(downloader)
# NOTE(review): the `def suitable(url):` line (and its @staticmethod
# decorator, presumably) is missing -- only its docstring survives.
536 """Receives a URL and returns True if suitable for this IE."""
539 def initialize(self):
540 """Initializes an instance (authentication, etc)."""
# Presumably guarded by a ready-flag (missing line 541) so that
# _real_initialize runs at most once -- TODO confirm.
542 self._real_initialize()
545 def extract(self, url):
546 """Extracts URL information and returns it in list of dicts."""
548 return self._real_extract(url)
550 def set_downloader(self, downloader):
551 """Sets the downloader for this IE."""
552 self._downloader = downloader
554 def _real_initialize(self):
555 """Real initialization process. Redefine in subclasses."""
558 def _real_extract(self, url):
559 """Real extraction process. Redefine in subclasses."""
562 class YoutubeIE(InfoExtractor):
563 """Information extractor for youtube.com."""
# The (?(1).+)?$ conditional group requires extra trailing content only
# when the full URL prefix (group 1) matched; the video id is group 2.
565 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
566 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
567 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
568 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
569 _NETRC_MACHINE = 'youtube'
570 _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
# NOTE(review): the _video_extensions dict body (lines 572-580) and the
# suitable() header are missing from this sampled extraction.
571 _video_extensions = {
581 return (re.match(YoutubeIE._VALID_URL, url) is not None)
584 def htmlentity_transform(matchobj):
585 """Transforms an HTML entity to a Unicode character."""
586 entity = matchobj.group(1)
588 # Known non-numeric HTML entity
589 if entity in htmlentitydefs.name2codepoint:
590 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (#123) or hex (#x1F).  The 'x' prefix becomes
# '0x...' so long(numstr, base) can parse it; the `base` assignments
# (presumably 16/10 on the missing lines 597/599-600) are absent here.
593 mobj = re.match(ur'(?u)#(x?\d+)', entity)
595 numstr = mobj.group(1)
596 if numstr.startswith(u'x'):
598 numstr = u'0%s' % numstr
601 return unichr(long(numstr, base))
603 # Unknown entity in name, return its literal representation
604 return (u'&%s;' % entity)
606 def report_lang(self):
607 """Report attempt to set language."""
608 self._downloader.to_stdout(u'[youtube] Setting language')
610 def report_login(self):
611 """Report attempt to log in."""
612 self._downloader.to_stdout(u'[youtube] Logging in')
614 def report_age_confirmation(self):
615 """Report attempt to confirm age."""
616 self._downloader.to_stdout(u'[youtube] Confirming age')
618 def report_video_info_webpage_download(self, video_id):
619 """Report attempt to download video info webpage."""
620 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
622 def report_information_extraction(self, video_id):
623 """Report attempt to extract video information."""
624 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
626 def report_unavailable_format(self, video_id, format):
627 """Report extracted video URL."""
628 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
630 def report_rtmp_download(self):
631 """Indicate the download will use the RTMP protocol."""
632 self._downloader.to_stdout(u'[youtube] RTMP download detected')
# Initialization: resolve credentials, force the English site, log in,
# confirm age.  Several guard/return/blank lines are missing throughout.
634 def _real_initialize(self):
635 if self._downloader is None:
640 downloader_params = self._downloader.params
642 # Attempt to use provided username and password or .netrc data
643 if downloader_params.get('username', None) is not None:
644 username = downloader_params['username']
645 password = downloader_params['password']
646 elif downloader_params.get('usenetrc', False):
648 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# NOTE(review): the branch unpacking `info` into username/password
# (lines 649-652) is missing from this sampled extraction.
653 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
654 except (IOError, netrc.NetrcParseError), err:
655 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set the UI language to English so later regex scraping is stable.
659 request = urllib2.Request(self._LANG_URL, None, std_headers)
662 urllib2.urlopen(request).read()
663 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
664 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
667 # No authentication to be performed
# Log in by POSTing the signup form fields.
673 'current_form': 'loginForm',
675 'action_login': 'Log In',
676 'username': username,
677 'password': password,
679 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
682 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
683 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
684 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
686 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
687 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age to unlock age-restricted videos.
693 'action_confirm': 'Confirm',
695 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
697 self.report_age_confirmation()
698 age_results = urllib2.urlopen(request).read()
699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
700 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
703 def _real_extract(self, url):
704 # Extract video id from URL
705 mobj = re.match(self._VALID_URL, url)
707 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
709 video_id = mobj.group(2)
711 # Downloader parameters
# format '0' (the -b "best quality" flag) starts a walk down
# _available_formats; the surrounding retry loop header is missing.
715 if self._downloader is not None:
716 params = self._downloader.params
717 format_param = params.get('format', None)
718 if format_param == '0':
719 format_param = self._available_formats[quality_index]
724 video_extension = self._video_extensions.get(format_param, 'flv')
# Fetch the get_video_info page and parse it as a query string.
727 video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
728 request = urllib2.Request(video_info_url, None, std_headers)
730 self.report_video_info_webpage_download(video_id)
731 video_info_webpage = urllib2.urlopen(request).read()
732 video_info = urlparse.parse_qs(video_info_webpage)
733 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
734 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
736 self.report_information_extraction(video_id)
# No 'token' -> either report YouTube's own reason, or dump the raw
# response to a reportme-ydl-*.dat file for bug reports.
739 if 'token' not in video_info:
740 # Attempt to see if YouTube has issued an error message
741 if 'reason' not in video_info:
742 self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
743 stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
744 stream.write(video_info_webpage)
747 reason = urllib.unquote_plus(video_info['reason'][0])
748 self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
750 token = urllib.unquote_plus(video_info['token'][0])
751 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
752 if format_param is not None:
753 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
755 # Check possible RTMP download
756 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
757 self.report_rtmp_download()
758 video_real_url = video_info['conn'][0]
761 if 'author' not in video_info:
762 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
764 video_uploader = urllib.unquote_plus(video_info['author'][0])
767 if 'title' not in video_info:
768 self._downloader.trouble(u'ERROR: unable to extract video title')
# Title cleanup: unquote, decode, expand HTML entities, and replace the
# path separator so the title is safe inside a filename.
770 video_title = urllib.unquote_plus(video_info['title'][0])
771 video_title = video_title.decode('utf-8')
772 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
773 video_title = video_title.replace(os.sep, u'%')
# Simplified title: collapse every run outside simple_title_chars to '_'.
776 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
777 simple_title = simple_title.strip(ur'_')
780 # Process video information
781 self._downloader.process_info({
782 'id': video_id.decode('utf-8'),
783 'url': video_real_url.decode('utf-8'),
784 'uploader': video_uploader.decode('utf-8'),
785 'title': video_title,
786 'stitle': simple_title,
787 'ext': video_extension.decode('utf-8'),
# Format fallback: on UnavailableFormatError step quality_index to the
# next entry (increment line missing) and retry, until the list runs out.
792 except UnavailableFormatError, err:
794 if quality_index == len(self._available_formats) - 1:
795 # I don't ever expect this to happen
796 self._downloader.trouble(u'ERROR: no known formats available for video')
799 self.report_unavailable_format(video_id, format_param)
801 format_param = self._available_formats[quality_index]
804 self._downloader.trouble('ERROR: format not available for video')
808 class MetacafeIE(InfoExtractor):
809 """Information Extractor for metacafe.com."""
811 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
812 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
813 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE around because Metacafe hosts some YouTube videos
# (ids prefixed "yt-", delegated in _real_extract).
816 def __init__(self, youtube_ie, downloader=None):
817 InfoExtractor.__init__(self, downloader)
818 self._youtube_ie = youtube_ie
# NOTE(review): the suitable() header/decorator lines are missing from
# this sampled extraction.
822 return (re.match(MetacafeIE._VALID_URL, url) is not None)
824 def report_disclaimer(self):
825 """Report disclaimer retrieval."""
826 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
828 def report_age_confirmation(self):
829 """Report attempt to confirm age."""
830 self._downloader.to_stdout(u'[metacafe] Confirming age')
832 def report_download_webpage(self, video_id):
833 """Report webpage download."""
834 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
836 def report_extraction(self, video_id):
837 """Report information extraction."""
838 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page, then POST the family-filter form to disable
# the filter for this session.  Some try/form lines are missing.
840 def _real_initialize(self):
841 # Retrieve disclaimer
842 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
844 self.report_disclaimer()
845 disclaimer = urllib2.urlopen(request).read()
846 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
847 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
853 'submit': "Continue - I'm over 18",
855 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
857 self.report_age_confirmation()
858 disclaimer = urllib2.urlopen(request).read()
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
863 def _real_extract(self, url):
864 # Extract id and simplified title from URL
865 mobj = re.match(self._VALID_URL, url)
867 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
870 video_id = mobj.group(1)
# "yt-<id>" videos are really hosted on YouTube -- delegate and return.
872 # Check if video comes from YouTube
873 mobj2 = re.match(r'^yt-(.*)$', video_id)
874 if mobj2 is not None:
875 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
878 simple_title = mobj.group(2).decode('utf-8')
879 video_extension = 'flv'
881 # Retrieve video webpage to extract further information
882 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
884 self.report_download_webpage(video_id)
885 webpage = urllib2.urlopen(request).read()
886 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
887 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
890 # Extract URL, uploader and title from webpage
891 self.report_extraction(video_id)
892 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
894 self._downloader.trouble(u'ERROR: unable to extract media URL')
896 mediaURL = urllib.unquote(mobj.group(1))
# gdaKey handling is commented out upstream; presumably video_url is set
# directly from mediaURL on a missing line -- TODO confirm.
898 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
900 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
902 #gdaKey = mobj.group(1)
904 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
908 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
910 self._downloader.trouble(u'ERROR: unable to extract title')
912 video_title = mobj.group(1).decode('utf-8')
914 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
916 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
918 video_uploader = mobj.group(1)
921 # Process video information
922 self._downloader.process_info({
923 'id': video_id.decode('utf-8'),
924 'url': video_url.decode('utf-8'),
925 'uploader': video_uploader.decode('utf-8'),
926 'title': video_title,
927 'stitle': simple_title,
928 'ext': video_extension.decode('utf-8'),
930 except UnavailableFormatError:
931 self._downloader.trouble(u'ERROR: format not available for video')
934 class YoutubeSearchIE(InfoExtractor):
935 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q" pseudo-URLs.
936 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
937 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
938 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
939 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
941 _max_youtube_results = 1000
# Results are delegated to the wrapped YoutubeIE, one watch-URL at a time.
943 def __init__(self, youtube_ie, downloader=None):
944 InfoExtractor.__init__(self, downloader)
945 self._youtube_ie = youtube_ie
# NOTE(review): the suitable() header/decorator lines are missing.
949 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
951 def report_download_page(self, query, pagenum):
952 """Report attempt to download playlist page with given number."""
953 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
955 def _real_initialize(self):
956 self._youtube_ie.initialize()
958 def _real_extract(self, query):
959 mobj = re.match(self._VALID_QUERY, query)
961 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; no prefix -> 1 result,
# 'all' -> the hard maximum, else parse N (guards partly missing).
964 prefix, query = query.split(':')
967 self._download_n_results(query, 1)
969 elif prefix == 'all':
970 self._download_n_results(query, self._max_youtube_results)
976 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
978 elif n > self._max_youtube_results:
979 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
980 n = self._max_youtube_results
981 self._download_n_results(query, n)
983 except ValueError: # parsing prefix as integer fails
984 self._download_n_results(query, 1)
987 def _download_n_results(self, query, n):
988 """Downloads a specified number of results for a query"""
# NOTE(review): initialization of video_ids/already_seen/pagenum and the
# page loop header are missing from this sampled extraction.
995 self.report_download_page(query, pagenum)
996 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
997 request = urllib2.Request(result_url, None, std_headers)
999 page = urllib2.urlopen(request).read()
1000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1001 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1004 # Extract video identifiers
# Pull the id out of href="/watch?v=ID" by splitting on '=' and dropping
# the trailing quote; de-duplicate across pages.
1005 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1006 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1007 if video_id not in already_seen:
1008 video_ids.append(video_id)
1009 already_seen.add(video_id)
1010 if len(video_ids) == n:
1011 # Specified n videos reached
1012 for id in video_ids:
1013 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link -> last results page: extract what we have and stop.
1016 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1017 for id in video_ids:
1018 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1021 pagenum = pagenum + 1
1023 class YoutubePlaylistIE(InfoExtractor):
1024 """Information Extractor for YouTube playlists."""
1026 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1027 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1028 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
# Used as a plain substring (via `in page` below), not as a regex, so the
# unescaped '?' is harmless here.
1029 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
# Found videos are delegated to the wrapped YoutubeIE.
1032 def __init__(self, youtube_ie, downloader=None):
1033 InfoExtractor.__init__(self, downloader)
1034 self._youtube_ie = youtube_ie
# NOTE(review): the suitable() header/decorator lines are missing.
1038 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1040 def report_download_page(self, playlist_id, pagenum):
1041 """Report attempt to download playlist page with given number."""
1042 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1044 def _real_initialize(self):
1045 self._youtube_ie.initialize()
1047 def _real_extract(self, url):
1048 # Extract playlist id
1049 mobj = re.match(self._VALID_URL, url)
1051 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1054 # Download playlist pages
1055 playlist_id = mobj.group(1)
# NOTE(review): video_ids/pagenum initialization and the page loop header
# are missing from this sampled extraction.
1060 self.report_download_page(playlist_id, pagenum)
1061 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1063 page = urllib2.urlopen(request).read()
1064 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1065 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1068 # Extract video identifiers
# Collect each page's ids (de-duplicated within the page) in order.
1070 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1071 if mobj.group(1) not in ids_in_page:
1072 ids_in_page.append(mobj.group(1))
1073 video_ids.extend(ids_in_page)
# Stop when no link to the next page appears in the page body.
1075 if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1077 pagenum = pagenum + 1
1079 for id in video_ids:
1080 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Fetches a user's gdata uploads feed and hands every video id found in
	it to the wrapped YoutubeIE for the actual extraction.
	"""

	# group(1) captures the username (greedy: everything after /user/).
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
	# gdata API endpoint for a user's uploads; %s is the username.
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# Captures a video id from a gdata feed entry link.
	_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)'

	def __init__(self, youtube_ie, downloader=None):
		# Mutual registration: keep the YoutubeIE that extracts single videos.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# True iff this extractor can handle the given URL.
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username):
		"""Report attempt to download user page."""
		self._downloader.to_stdout(u'[youtube] USR %s: Downloading page ' % (username))

	def _real_initialize(self):
		# Delegate initialization (e.g. authentication) to the wrapped IE.
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Download user page
		username = mobj.group(1)

		self.report_download_page(username)
		request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
			page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# Deduplicate while preserving first-seen order.
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

		for id in video_ids:
			# Delegate each collected video to the plain YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a downloader with its add_post_processor()
	method.  After every successful download the downloader walks its chain
	of PostProcessors, calling run() first with an initial information
	dictionary and then with each previous step's return value.  The chain
	stops when a step returns None or when its end is reached.

	Like InfoExtractor objects, PostProcessors follow a "mutual
	registration" scheme with their downloader.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this postprocessor reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary carrying one
		extra key, "filepath", naming the downloaded file on disk.

		Returning None aborts the postprocessing chain; returning an
		information dictionary (possibly the received one with some fields
		changed) passes it on to the next object in the chain.  May raise
		PostProcessingError, which the downloader takes into account.
		"""
		# Default behaviour: pass the information through untouched.
		return information
### MAIN PROGRAM ###
if __name__ == '__main__':
	# Modules needed only when running the main program (not on import).
	# Function to update the program file with the latest version from bitbucket.org
1193 def update_self(downloader, filename):
1194 # Note: downloader only used for options
1195 if not os.access (filename, os.W_OK):
1196 sys.exit('ERROR: no write permissions on %s' % filename)
1198 downloader.to_stdout('Updating to latest stable version...')
1199 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1200 latest_version = urllib.urlopen(latest_url).read().strip()
1201 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1202 newcontent = urllib.urlopen(prog_url).read()
1203 stream = open(filename, 'w')
1204 stream.write(newcontent)
1206 downloader.to_stdout('Updated to version %s' % latest_version)
	# General configuration
	# NOTE(review): the second install_opener replaces the first opener;
	# build_opener includes a ProxyHandler by default, so proxy support is
	# presumably still active — confirm against urllib2 docs.
	urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
	urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	# Parse command line
	parser = optparse.OptionParser(
		usage='Usage: %prog [options] url...',
		version='2009.12.26',
		conflict_handler='resolve',

	# -h/-v are redefined explicitly; conflict_handler='resolve' above lets
	# them override optparse's builtin help/version options.
	parser.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	parser.add_option('-v', '--version',
			action='version', help='print program version and exit')
	parser.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
	parser.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	parser.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	authentication.add_option('-u', '--username',
			dest='username', metavar='UN', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PW', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
	parser.add_option_group(authentication)

	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FMT', help='video format code')
	# -b/-m/-d are aliases that preload specific format codes into 'format'.
	video_format.add_option('-b', '--best-quality',
			action='store_const', dest='format', help='download the best quality video possible', const='0')
	video_format.add_option('-m', '--mobile-version',
			action='store_const', dest='format', help='alias for -f 17', const='17')
	video_format.add_option('-d', '--high-def',
			action='store_const', dest='format', help='alias for -f 22', const='22')
	parser.add_option_group(video_format)

	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	parser.add_option_group(verbosity)

	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TPL', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='F', help='file containing URLs to download')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	parser.add_option_group(filesystem)

	(opts, args) = parser.parse_args()

	# Batch file verification
	if opts.batchfile is not None:
			batchurls = open(opts.batchfile, 'r').readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines so they do not become empty URLs.
			batchurls = [x for x in batchurls if len(x) > 0]
			sys.exit(u'ERROR: batch file could not be read')
	# Batch-file URLs are downloaded before the command-line ones.
	all_urls = batchurls + args

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
		parser.error(u'using output template conflicts with using title or literal title')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively rather than taking the password from the
		# command line (where it would show up in the process list).
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		# Normalize "50k"/"44.6m"-style limits to a plain byte count.
		opts.ratelimit = numeric_limit

	# Information extractors
	# All secondary extractors delegate single videos to the same YoutubeIE.
	youtube_ie = YoutubeIE()
	metacafe_ie = MetacafeIE(youtube_ie)
	youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
	youtube_user_ie = YoutubeUserIE(youtube_ie)
	youtube_search_ie = YoutubeSearchIE(youtube_ie)

	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# -g/-e imply quiet + simulate so only the URL/title is printed.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'simulate': (opts.simulate or opts.geturl or opts.gettitle),
		'format': opts.format,
		# Template priority: explicit -o, then -t (sanitized title), then
		# -l (literal title), then the bare video id.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'continuedl': opts.continue_dl,
	# Registration order matters: the more specific extractors are tried
	# first, leaving plain watch URLs to the generic YoutubeIE.
	fd.add_info_extractor(youtube_search_ie)
	fd.add_info_extractor(youtube_pl_ie)
	fd.add_info_extractor(youtube_user_ie)
	fd.add_info_extractor(metacafe_ie)
	fd.add_info_extractor(youtube_ie)

	if opts.update_self:
		update_self(fd, sys.argv[0])

	# With -U alone it is legal to give no URLs; otherwise at least one
	# URL (from the command line or the batch file) is required.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		retcode = fd.download(all_urls)

	# A DownloadError has already been reported by the downloader itself.
	except DownloadError:
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')