2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# NOTE(review): this chunk elides lines; the opener of this dictionary
# (presumably ``std_headers = {``) lies outside the visible span.
# Browser-like HTTP headers sent with every request so sites serve the
# normal desktop pages rather than blocking an unknown client.
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',

# Characters kept verbatim when building a "simplified" title (used by the
# IEs below via re.sub). Python 2: str.decode('ascii') yields unicode here.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Raised by FileDownloader.trouble() unless 'ignoreerrors' is set.
# NOTE(review): the docstring terminator and any class body lines are
# elided from this chunk.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
# Raised by FileDownloader.download() when several URLs would be written
# to one fixed output filename (see fixed_template()).
# NOTE(review): docstring terminator and class body elided from this chunk.
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
# Raised by PostProcessor.run() implementations; caught in
# FileDownloader.process_info() and reported via trouble().
# NOTE(review): docstring terminator and class body elided from this chunk.
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
# NOTE(review): many interior lines of this class are elided in this chunk
# (several ``try:`` headers, guards and returns); the visible lines are
# kept byte-identical and gaps are preserved.
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    simulate: Do not download the video files.
    format: Video format code.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    """

    # Process-wide exit code accumulated by trouble(); returned by download().
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build the list of ancestor paths, shortest first, each ending in os.sep.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):

    def format_bytes(bytes):
        # Pick a power-of-1024 suffix ('b', 'k', 'M', ...) for a human-readable size.
        # NOTE(review): the guard lines preceding this (presumably handling
        # None/zero byte counts) are elided from this chunk.
        exponent = long(math.log(float(bytes), 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Right-aligned percentage string, e.g. ' 42.0%'.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate remaining time from the average rate so far.
        # NOTE(review): the line computing ``dif`` is elided from this chunk.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Right-aligned speed string; '---b/s' when no data or no elapsed time.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the read block size to the observed throughput, clamped to
        # [half, double] of the last block and never above 4 MiB.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # Suffix position in 'bkmgtpezy' gives the power of 1024 (empty -> 'b' -> 0).
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        ie.set_downloader(self)  # mutual registration

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)  # mutual registration

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # Trailing comma suppresses the newline; encode for the console locale.
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # Fixed means: no %(field)s substitutions at all in outtmpl.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors were ignored: remember a non-zero exit status for download().
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough that the average speed drops to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        # Leading \r rewrites the same console line on every update.
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Forced printing happens even in simulate mode.
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(locale.getpreferredencoding())
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(locale.getpreferredencoding())

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):

            filename = self.params['outtmpl'] % info_dict
            self.report_destination(filename)
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr('WARNING: file exists: %s; skipping' % filename)

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

            outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to open for writing: %s' % str(err))

            self._do_download(outstream, info_dict['url'])
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to write video data: %s' % str(err))
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))

            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed (substitution-free) template cannot hold more than one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL
            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info['filepath'] = filename

    def _do_download(self, stream, url):
        # Download ``url`` to the already-open ``stream``, printing progress.
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data_len = data.info().get('Content-length', None)  # header value is a string (or None)
        data_len_str = self.format_bytes(data_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            data_block = data.read(block_size)
            data_block_len = len(data_block)
            if data_block_len == 0:
            byte_counter += data_block_len
            stream.write(data_block)
            # Adapt the next read size to the time this block took.
            block_size = self.best_block_size(after - before, data_block_len)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # Compare against the string Content-length via str() on the counter.
        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
# NOTE(review): some interior lines (e.g. the ``suitable()`` def line and
# readiness guards) are elided from this chunk; visible code kept verbatim.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

        # NOTE(review): the ``def suitable(url):`` line for this docstring
        # is elided from this chunk.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
# NOTE(review): several guard lines (``if mobj is None:``-style checks,
# ``try:`` headers, dict openers) are elided from this chunk; the visible
# code is kept verbatim with gaps preserved.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1: optional URL prefix; group 2: the video id.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'

        # NOTE(review): the enclosing ``suitable(url)`` def line is elided.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (#160) or hex (#xA0).
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                # Prefix '0' so long(numstr, 16) accepts the '0x...' form.
                numstr = u'0%s' % numstr
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

    def _real_initialize(self):
        # Sets language, then (optionally) logs in and confirms age.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # .netrc problems are non-fatal: warn and continue unauthenticated.
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (forces English pages so the regexes below match).
        request = urllib2.Request(self._LANG_URL, None, std_headers)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Downloader parameters
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
        # NOTE(review): the opener of the format->extension mapping whose
        # closing line follows is elided from this chunk.
        }.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
        self.report_information_extraction(video_id)

        # The "t" parameter is the per-request token needed by get_video.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        # Decode HTML entities, then avoid path separators in the filename.
        video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
        video_title = video_title.replace(os.sep, u'%')

        # Collapse any run of disallowed characters into a single underscore.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
# NOTE(review): guard lines and ``try:`` headers are elided from this
# chunk; visible code kept verbatim with gaps preserved.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id; group 2: simplified title from the URL path.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Metacafe pages can embed YouTube videos; delegate those to this IE.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing ``suitable(url)`` def line is elided.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

            # NOTE(review): the opener of this disclaimer form dict is elided.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate embedded YouTube videos to the YouTube IE.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        gdaKey = mobj.group(1)

        # The gdaKey query parameter authorizes access to the media URL.
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
# NOTE(review): loop headers (``while True:``, ``for id in video_ids:``)
# and several guards are elided from this chunk; visible code kept verbatim.
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q" pseudo-URLs.
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'>Next</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Found video ids are handed off to the YouTube IE for extraction.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing ``suitable(url)`` def line is elided.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
            # Bare "ytsearch:" -> first result only.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as int fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # href="/watch?v=ID" -> split on '=' and drop the closing quote.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            # No "Next" link means this was the last results page.
            if self._MORE_PAGES_INDICATOR not in page:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
# NOTE(review): loop headers and several guards are elided from this
# chunk; visible code kept verbatim with gaps preserved.
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # %-template checked against the page to detect a further page link.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Collected video ids are handed off to the YouTube IE one by one.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing ``suitable(url)`` def line is elided.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when no link to the next page appears in this page.
            if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
            pagenum = pagenum + 1

            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): the docstring terminators of this class and of run() are
# elided from this chunk; visible code kept verbatim.
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader

        return information # by default, do nothing
# NOTE(review): several lines of this entry-point block are elided from
# this chunk (including, presumably, the ``try:`` that the trailing
# ``except`` clauses belong to and some imports); visible code kept verbatim.
if __name__ == '__main__':
    # Modules needed only when running the main program

    # General configuration
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
    # NOTE(review): this second install_opener replaces the ProxyHandler
    # opener installed on the previous line.
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Command-line interface (conflict_handler lets -h/-v be redefined).
    parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            conflict_handler='resolve',
    parser.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
            action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
            dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
            dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
            dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
            dest='format', metavar='FMT', help='video format code')
    parser.add_option('-m', '--mobile-version',
            action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-d', '--high-def',
            action='store_const', dest='format', help='alias for -f 22', const='22')
    parser.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-a', '--batch-file',
            dest='batchfile', metavar='F', help='file containing URLs to download')
    parser.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    (opts, args) = parser.parse_args()

    # Batch file verification
    if opts.batchfile is not None:
        # Read one URL per line, dropping surrounding whitespace and blanks.
        batchurls = open(opts.batchfile, 'r').readlines()
        batchurls = [x.strip() for x in batchurls]
        batchurls = [x for x in batchurls if len(x) > 0]
        sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if len(all_urls) < 1:
        sys.exit(u'ERROR: you must provide at least one URL')
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        sys.exit(u'ERROR: account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
    if opts.usetitle and opts.useliteral:
        sys.exit(u'ERROR: using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Prompt interactively rather than requiring -p on the command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            sys.exit(u'ERROR: invalid rate limit specified')
        opts.ratelimit = numeric_limit

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)

    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # -g/-e imply quiet + simulate.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
        'format': opts.format,
        # Template priority: explicit -o, then -t, then -l, then plain id.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
    # Order matters: more specific extractors are consulted first.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(youtube_ie)
    retcode = fd.download(all_urls)

except DownloadError:
except SameFileError:
    sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
    sys.exit(u'\nERROR: Interrupted by user')