2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
24 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
25 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
26 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
27 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title: ASCII letters and digits,
# decoded to unicode (Python 2) so regexes built from this are unicode patterns.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            # Probe the codec; an unset/bogus locale can report an
            # encoding that cannot actually encode anything.
            u'TEST'.encode(pref)
        except Exception:
            pref = 'UTF-8'
        while True:
            yield pref
    # next() builtin instead of py2-only .next() — works on 2.6+ and 3.x.
    return next(yield_preferredencoding())
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Store the observed and the announced sizes, both in bytes."""
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options (keys of the params dictionary):

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    simulate: Do not download the video files.
    format: Video format code.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    continuedl: Try to continue downloads if possible.
    """

    # Process return code: 0 until trouble() records a non-fatal error.
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): the source chunk elides lines here; presumably the
        # params dict and the IE/PP lists are also initialized — confirm
        # against the full file.
        self._download_retcode = 0
def pmkdir(filename):
    """Create directory components in filename. Similar to Unix "mkdir -p".

    Creates every missing intermediate directory of `filename`; the last
    path component is treated as the file name and is NOT created.
    """
    components = filename.split(os.sep)
    # Build the list of cumulative prefixes: '/', '/a/', '/a/b/', ...
    # range() instead of py2-only xrange(); works identically here.
    aggregate = [os.sep.join(components[0:x]) for x in range(1, len(components))]
    aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
    for directory in aggregate:  # renamed: 'dir' shadows the builtin
        if not os.path.exists(directory):
            os.mkdir(directory)
def format_bytes(bytes):
    """Format a byte count as a short human-readable string, e.g. '1.00k'.

    Accepts None (returns 'N/A'), a numeric string, or a number.
    Suffixes are powers of 1024: b, k, M, G, T, P, E, Z, Y.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; zero bytes formats as '0.00b'.
        exponent = 0
    else:
        # int() instead of py2-only long(); truncates toward the suffix floor.
        exponent = int(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
    """Return download progress as a 6-char right-aligned percentage string.

    data_len may be None (unknown total size), in which case a
    placeholder of the same width is returned.
    """
    if data_len is None:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
    """Estimate remaining download time as 'MM:SS', or '--:--' when unknown.

    start/now are timestamps (seconds); total/current are byte counts.
    Returns the placeholder when the total size is unknown, nothing has
    been downloaded yet, almost no time has elapsed, or the ETA would
    not fit in two minute digits.
    """
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001: # One millisecond
        return '--:--'
    rate = float(current) / dif
    # int() instead of py2-only long().
    eta = int((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    if eta_mins > 99:
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Format current download speed as a 10-char right-aligned string.

    Returns a '---b/s' placeholder when no bytes have been transferred
    or almost no time has elapsed.
    """
    dif = now - start
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
    """Pick the next read size from how fast the last block downloaded.

    Aims for roughly one read per unit of time: the measured rate,
    clamped to [bytes/2, bytes*2] and capped at 4 MB.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
        # Too fast to measure: grow aggressively.
        return int(new_max)
    rate = bytes / elapsed_time
    if rate > new_max:
        return int(new_max)
    if rate < new_min:
        return int(new_min)
    return int(rate)
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into an integer.

    Accepts an optional single-letter 1024-based suffix (k, M, G, T, P,
    E, Z, Y, case-insensitive), e.g. '10k' -> 10240. Returns None when
    the string does not look like a byte quantity.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An empty suffix indexes as 0 ('b'), i.e. multiplier 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    # int() instead of py2-only long().
    return int(round(number * multiplier))
# NOTE(review): this chunk is heavily elided — several 'def' headers,
# 'try:' lines and loop/if bodies are missing, leaving orphaned 'except'
# clauses below. Code tokens are preserved as-is; only comments/docstrings
# were added. The line right below belongs to an elided
# "def verify_url(self, url):" header.
    """Verify a URL is valid and data could be downloaded. Return real data URL."""
    request = urllib2.Request(url, None, std_headers)
    data = urllib2.urlopen(request)

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE learns about its downloader here.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # Encode to the locale's preferred encoding (py2 print statement).
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors were ignored; remember a non-zero exit status.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        # NOTE(review): the 'return' body of this guard and the
        # 'now = time.time()' line are elided in this chunk.
        if rate_limit is None or byte_counter == 0:
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep long enough that the average speed drops to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_stdout(u'[download] %s has already been downloaded' % file_name)

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        # NOTE(review): method body elided in this chunk.

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several 'try:'/'return' lines are elided below,
        # leaving orphaned 'except' clauses.
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
        info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
            raise UnavailableFormatError

        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding())
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding())

        template_dict = dict(info_dict)
        # 'epoch' is available to output templates as the current UNIX time.
        template_dict['epoch'] = unicode(long(time.time()))
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)

        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

        success = self._do_download(filename, info_dict['url'].encode('utf-8'))
        except (OSError, IOError), err:
            # Treat I/O failures here as the format being unavailable.
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template would make every URL write to the same file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and the registered IEs
        # are elided in this chunk.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it

        # Suitable InfoExtractor had been found; go to next URL

        if not suitable_found:
            self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into 'info' and the loop over
        # the PostProcessor chain are elided in this chunk.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url):
        # Download an rtmp:// URL by shelling out to the external rtmpdump tool.
        self.report_destination(filename)

        # Check for rtmpdump first
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
        time.sleep(2.0) # This seems to be needed
        retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
        self.trouble('ERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url):
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url)

        # Two requests: 'request' may carry a Range header for resuming;
        # 'basic_request' is the fallback without it.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # Establish connection
        data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if err.code != 416: # 416 is 'Requested range not satisfiable'
            # Retry without the Range header.
            data = urllib2.urlopen(basic_request)
            content_length = data.info()['Content-Length']
            if content_length is not None and long(content_length) == resume_len:
                # Because the file had already been fully downloaded
                self.report_file_already_downloaded(filename)
                # Because the server didn't let us
                self.report_unable_to_resume()

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        # NOTE(review): the read loop header and counters are elided here.
        data_block = data.read(block_size)
        data_block_len = len(data_block)
        if data_block_len == 0:
        byte_counter += data_block_len

        # Open file just in time
        stream = open(filename, open_mode)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to open for writing: %s' % str(err))
        stream.write(data_block)
        # Adapt the read size to the measured throughput of the last block.
        block_size = self.best_block_size(after - before, data_block_len)

        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
        speed_str = self.calc_speed(start, time.time(), byte_counter)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter)

        # data_len is a string here (header value) — hence str(byte_counter).
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a line is elided here (presumably the lazy-init
        # flag read by initialize() — confirm against the full file).
        self.set_downloader(downloader)

    # NOTE(review): the "def suitable(url):" header (a static method in
    # the full file) is elided; only its docstring survives below.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the ready-flag guard around this call is elided.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the preceding self.initialize() call is elided.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 matches the site prefix; group 2 is the video id.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
    _video_extensions = {
        # NOTE(review): the format-code -> extension mapping and the
        # "def suitable(url):" static-method header for the line below
        # are elided in this chunk.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # NOTE(review): the 'mobj is not None' guard and the numeric
        # base selection (10 vs 16 for '&#x..;') are elided below.
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            numstr = u'0%s' % numstr
        return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        # Set language, log in (from params or .netrc), and confirm age.
        # NOTE(review): many scaffolding lines (returns, try:, the
        # login_form/age_form dict headers) are elided in this chunk,
        # leaving orphaned except clauses and bare dict entries.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort: failure is only a warning).
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        login_results = urllib2.urlopen(request).read()
        # A loginForm in the response means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # NOTE(review): the quality-retry loop header, try: lines and
        # return statements are elided throughout this method.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Downloader parameters
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            # format '0' means "best available": start at the top of the list.
            if format_param == '0':
                format_param = self._available_formats[quality_index]

        video_extension = self._video_extensions.get(format_param, 'flv')

        # Get video info
        video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
        request = urllib2.Request(video_info_url, None, std_headers)
        self.report_video_info_webpage_download(video_id)
        video_info_webpage = urllib2.urlopen(request).read()
        video_info = urlparse.parse_qs(video_info_webpage)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        self.report_information_extraction(video_id)

        if 'token' not in video_info:
            # Attempt to see if YouTube has issued an error message
            if 'reason' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                # Dump the raw response for bug reports.
                stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                stream.write(video_info_webpage)
            reason = urllib.unquote_plus(video_info['reason'][0])
            self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
        token = urllib.unquote_plus(video_info['token'][0])
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

        # Check possible RTMP download
        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_real_url = video_info['conn'][0]

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
        # os.sep cannot appear in a filename; replace it.
        video_title = video_title.replace(os.sep, u'%')

        # Collapse runs of disallowed characters into single underscores.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError, err:
            if quality_index == len(self._available_formats) - 1:
                # I don't ever expect this to happen
                self._downloader.trouble(u'ERROR: no known formats available for video')
            # Fall back to the next-best format and retry.
            self.report_unavailable_format(video_id, format_param)
            format_param = self._available_formats[quality_index]
            self._downloader.trouble('ERROR: format not available for video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the URL-embedded simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Metacafe pages may embed YouTube videos; delegate those to this IE.
        self._youtube_ie = youtube_ie

    # NOTE(review): the "def suitable(url):" static-method header for the
    # line below is elided in this chunk.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # NOTE(review): 'try:'/'return' lines and the disclaimer_form dict
        # header are elided below, leaving orphaned except clauses.
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by POSTing the family-filter form.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # NOTE(review): 'try:'/'return' and some guard lines are elided
        # throughout this method.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Previously the gdaKey query parameter was also extracted:
        #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        #gdaKey = mobj.group(1)
        #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Found videos are handed to the YouTube IE for actual extraction.
        self._youtube_ie = youtube_ie

    # NOTE(review): the "def suitable(url):" static-method header for the
    # line below is elided in this chunk.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        # NOTE(review): guard/return/try lines and the prefix-dispatch
        # scaffolding are elided in this method.
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix -> single result; 'all' -> the site-imposed maximum;
        # a number -> that many results (clamped below).
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): the video_ids/already_seen/pagenum initialization
        # and the pagination loop header are elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Pull the id out of href="/watch?v=ID" (strip trailing quote).
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link: last results page reached.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1 is the playlist id (the p= query parameter).
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Plain substring template (not a regex): presence of the next-page
    # link decides whether pagination continues.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Playlist entries are handed to the YouTube IE for extraction.
        self._youtube_ie = youtube_ie

    # NOTE(review): the "def suitable(url):" static-method header for the
    # line below is elided in this chunk.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # NOTE(review): guard/return/try lines, the video_ids/pagenum
        # initialization and the pagination loop header are elided here.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop when the current page has no link to the next one.
        if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
        pagenum = pagenum + 1

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Downloads the user's GData feed, collects the video identifiers
    found in it, and delegates each one to the wrapped YoutubeIE.
    """

    # BUGFIX: escape the dot in "youtube.com" so it only matches a
    # literal dot instead of any character.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        # Authentication etc. is handled by the wrapped YoutubeIE.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download user page (single page; the GData feed is not paginated here)
        username = mobj.group(1)
        video_ids = []

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        # "as err" instead of ", err": valid from Python 2.6 on and
        # forward-compatible with Python 3.
        try:
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Extract video identifiers, keeping first-seen order and
        # dropping duplicates.
        ids_in_page = []
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # "video_id" instead of "id": avoid shadowing the builtin.
        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances are registered on a downloader with its
    add_post_processor() method. After each successful download the
    downloader walks its internal chain of PostProcessors, calling
    run() on each one — first with an initial argument, then with the
    value returned by the previous processor.

    The chain stops as soon as one of them returns None, or once the
    end of the chain is reached.

    PostProcessor objects follow a "mutual registration" process
    similar to InfoExtractor objects.
    """

    # Downloader this processor is registered with (set lazily).
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, with one extra field, "filepath",
        pointing at the downloaded file on disk.

        Returning None halts the postprocessing chain; returning an
        information dictionary (possibly with some fields changed)
        passes it on to the next processor. This method may also raise
        PostProcessingError, which the downloader takes into account.
        """
        # Default behaviour: hand the information through untouched.
        return information
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            # Note: downloader only used for options
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            # BUGFIX: close the file even if the write fails; the stream
            # was previously left open.
            stream = open(filename, 'w')
            try:
                stream.write(newcontent)
            finally:
                stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        # BUGFIX: install ONE opener carrying both handlers. Two successive
        # install_opener() calls leave only the last opener active, so the
        # ProxyHandler installed first was silently discarded.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2009.12.26',
            conflict_handler='resolve',
        )
        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FMT', help='video format code')
        video_format.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='download the best quality video possible', const='0')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('-d', '--high-def',
                action='store_const', dest='format', help='alias for -f 22', const='22')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='F', help='file containing URLs to download')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                batchfd = open(opts.batchfile, 'r')
                try:
                    # Strip surrounding whitespace and drop empty lines.
                    batchurls = [x.strip() for x in batchfd.readlines()]
                    batchurls = [x for x in batchurls if len(x) > 0]
                finally:
                    # BUGFIX: the batch file handle was never closed before.
                    batchfd.close()
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'continuedl': opts.continue_dl,
            })
        # Registration order matters: more specific extractors must be
        # consulted before the generic YoutubeIE.
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(youtube_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')