2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
22 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in "simplified" titles: ASCII letters and digits only.
# Decoded to unicode (Python 2 str.decode) so they can be interpolated into
# the unicode regex used to build simple_title in the extractors.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Raised when a video download fails.

    FileDownloader objects throw this exception when a download problem
    appears and they have not been configured to continue on errors.
    The exception carries the corresponding error message.
    """
    pass
class SameFileError(Exception):
    """Raised when several downloads would collide on one output file.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be written to the same path on disk
    (e.g. a fixed output template with more than one URL).
    """
    pass
class PostProcessingError(Exception):
    """Raised by a PostProcessor's run() method on failure.

    Signals to the downloader that the postprocessing task could not be
    completed for the file it was given.
    """
    pass
55 class FileDownloader(object):
56 """File Downloader class.
58 File downloader objects are the ones responsible of downloading the
59 actual video file and writing it to disk if the user has requested
60 it, among some other tasks. In most cases there should be one per
61 program. As, given a video URL, the downloader doesn't know how to
62 extract all the needed information, task that InfoExtractors do, it
63 has to pass the URL to one of them.
65 For this, file downloader objects have a method that allows
66 InfoExtractors to be registered in a given order. When it is passed
67 a URL, the file downloader handles it to the first InfoExtractor it
68 finds that reports being able to handle it. The InfoExtractor extracts
69 all the information about the video or videos the URL refers to, and
70 asks the FileDownloader to process the video information, possibly
71 downloading the video.
73 File downloaders accept a lot of parameters. In order not to saturate
74 the object constructor with arguments, it receives a dictionary of
75 options instead. These options are available through the params
76 attribute for the InfoExtractors to use. The FileDownloader also
77 registers itself as the downloader in charge for the InfoExtractors
78 that are added to it, so this is a "mutual registration".
82 username: Username for authentication purposes.
83 password: Password for authentication purposes.
84 usenetrc: Use netrc for authentication instead.
85 quiet: Do not print messages to stdout.
86 forceurl: Force printing final URL.
87 forcetitle: Force printing title.
88 simulate: Do not download the video files.
89 format: Video format code.
90 outtmpl: Template for output names.
91 ignoreerrors: Do not stop on download errors.
92 ratelimit: Download speed limit, in bytes/sec.
93 nooverwrites: Prevent overwriting files.
99 _download_retcode = None
101 def __init__(self, params):
102 """Create a FileDownloader object with the given options."""
105 self._download_retcode = 0
def pmkdir(filename):
    """Create the directory components of *filename*, like ``mkdir -p``.

    The last path component is treated as a file name and is NOT
    created. Already-existing directories are left untouched.
    """
    components = filename.split(os.sep)
    # Every ancestor prefix of the path, each terminated with os.sep so
    # names are unambiguous; the final (file name) component is excluded.
    aggregate = [os.sep.join(components[0:x]) + os.sep for x in range(1, len(components))]
    # 'directory' rather than 'dir': do not shadow the builtin.
    for directory in aggregate:
        if not os.path.exists(directory):
            os.mkdir(directory)
def format_bytes(bytes):
    """Return a human-readable size string such as '1.00k' for a byte count.

    *bytes* may be an int, a float or a numeric string (HTTP
    Content-Length header values arrive as strings). None yields 'N/A'
    since the server may not report a length at all.
    """
    if bytes is None:
        return 'N/A'
    if float(bytes) == 0.0:
        # math.log(0) raises ValueError; zero bytes is simply '0.00b'.
        exponent = 0
    else:
        exponent = int(math.log(float(bytes), 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
    """Return a fixed-width (6 char) percentage string.

    Returns the placeholder '---.-%' when the total size is unknown
    (None) or zero, which would otherwise raise TypeError or
    ZeroDivisionError.
    """
    if data_len is None or float(data_len) == 0.0:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
    """Estimate remaining download time as 'MM:SS'.

    Returns the placeholder '--:--' when no estimate is possible:
    unknown total size, nothing downloaded yet, elapsed time too small
    to measure, or an ETA too large for the two-digit minute field.
    *total* may be a numeric string (Content-Length header).
    """
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001:  # One millisecond
        return '--:--'
    rate = float(current) / dif
    eta = int((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    if eta_mins > 99:
        # Would not fit '%02d'; treat as "unknown" to keep the width fixed.
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Return a right-aligned (10 char) transfer-speed string like '   1.00k/s'.

    Returns the placeholder '---b/s' when nothing has been transferred
    or the elapsed time is below one millisecond (speed not measurable).
    """
    # 'dif' was referenced but never computed in this excerpt; derive it
    # from the two timestamps before using it.
    dif = now - start
    if bytes == 0 or dif < 0.001:  # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
    """Choose the next read size, aiming for roughly one read per second.

    The measured rate is clamped to [bytes/2, bytes*2] so the block
    size only changes gradually, and is capped at 4 MB overall.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
    if elapsed_time < 0.001:
        # Too fast to measure a rate: grow as aggressively as allowed.
        return int(new_max)
    rate = bytes / elapsed_time
    if rate > new_max:
        return int(new_max)
    if rate < new_min:
        return int(new_min)
    return int(rate)
def parse_bytes(bytestr):
    """Parse a string like '10k' or '44.6m' into an integer byte count.

    Accepts an optional single-letter binary suffix (k, M, G, T, P, E,
    Z, Y, case-insensitive). Returns None when the string is not a
    valid byte quantity instead of raising on a failed match.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An empty suffix finds index 0 ('b'), i.e. a multiplier of 1024**0 == 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return int(round(number * multiplier))
180 def add_info_extractor(self, ie):
181 """Add an InfoExtractor object to the end of the list."""
183 ie.set_downloader(self)
185 def add_post_processor(self, pp):
186 """Add a PostProcessor object to the end of the chain."""
188 pp.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode."""
    # 'quiet' is set for -q and also implied by --get-url/--get-title.
    if not self.params.get('quiet', False):
        # Encode explicitly for the console locale; skip_eol swaps the
        # newline for an empty string so progress lines can be redrawn
        # in place (see report_progress, which starts with '\r').
        print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
def to_stderr(self, message):
    """Print message to stderr (always, regardless of quiet mode)."""
    # Python 2 "print chevron" form writes to the given stream.
    print >>sys.stderr, message
200 def fixed_template(self):
201 """Checks if the output template is fixed."""
202 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    # The message is always printed, even when errors are ignored.
    if message is not None:
        self.to_stderr(message)
    # Without -i/--ignore-errors, abort immediately.
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Otherwise just remember the failure so the process can exit
    # with a non-zero return code after the remaining downloads.
    self._download_retcode = 1
217 def slow_down(self, start_time, byte_counter):
218 """Sleep if the download speed is over the rate limit."""
219 rate_limit = self.params.get('ratelimit', None)
220 if rate_limit is None or byte_counter == 0:
223 elapsed = now - start_time
226 speed = float(byte_counter) / elapsed
227 if speed > rate_limit:
228 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Report destination filename."""
    # Goes through to_stdout, so it is suppressed in quiet mode.
    self.to_stdout(u'[download] Destination: %s' % filename)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    # Leading '\r' returns the cursor to the start of the line and
    # skip_eol omits the newline, so successive calls redraw the same
    # console line instead of scrolling.
    self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
239 def report_finish(self):
240 """Report download finished."""
243 def process_info(self, info_dict):
244 """Process a single dictionary returned by an InfoExtractor."""
246 if self.params.get('forcetitle', False):
247 print info_dict['title'].encode(locale.getpreferredencoding())
248 if self.params.get('forceurl', False):
249 print info_dict['url'].encode(locale.getpreferredencoding())
251 # Do nothing else if in simulate mode
252 if self.params.get('simulate', False):
256 filename = self.params['outtmpl'] % info_dict
257 self.report_destination(filename)
258 except (ValueError, KeyError), err:
259 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
260 if self.params['nooverwrites'] and os.path.exists(filename):
261 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
264 self.pmkdir(filename)
265 except (OSError, IOError), err:
266 self.trouble('ERROR: unable to create directories: %s' % str(err))
269 outstream = open(filename, 'wb')
270 except (OSError, IOError), err:
271 self.trouble('ERROR: unable to open for writing: %s' % str(err))
274 self._do_download(outstream, info_dict['url'])
276 except (OSError, IOError), err:
277 self.trouble('ERROR: unable to write video data: %s' % str(err))
279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
280 self.trouble('ERROR: unable to download video data: %s' % str(err))
283 self.post_process(filename, info_dict)
284 except (PostProcessingError), err:
285 self.trouble('ERROR: postprocessing: %s' % str(err))
290 def download(self, url_list):
291 """Download a given list of URLs."""
292 if len(url_list) > 1 and self.fixed_template():
293 raise SameFileError(self.params['outtmpl'])
296 suitable_found = False
298 # Go to next InfoExtractor if not suitable
299 if not ie.suitable(url):
302 # Suitable InfoExtractor found
303 suitable_found = True
305 # Extract information from URL and process it
308 # Suitable InfoExtractor had been found; go to next URL
311 if not suitable_found:
312 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
314 return self._download_retcode
316 def post_process(self, filename, ie_info):
317 """Run the postprocessing chain on the given file."""
319 info['filepath'] = filename
325 def _do_download(self, stream, url):
326 request = urllib2.Request(url, None, std_headers)
327 data = urllib2.urlopen(request)
328 data_len = data.info().get('Content-length', None)
329 data_len_str = self.format_bytes(data_len)
335 percent_str = self.calc_percent(byte_counter, data_len)
336 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
337 speed_str = self.calc_speed(start, time.time(), byte_counter)
338 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
342 data_block = data.read(block_size)
344 data_block_len = len(data_block)
345 if data_block_len == 0:
347 byte_counter += data_block_len
348 stream.write(data_block)
349 block_size = self.best_block_size(after - before, data_block_len)
352 self.slow_down(start, byte_counter)
355 if data_len is not None and str(byte_counter) != data_len:
356 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
358 class InfoExtractor(object):
359 """Information Extractor class.
361 Information extractors are the classes that, given a URL, extract
362 information from the video (or videos) the URL refers to. This
363 information includes the real video URL, the video title and simplified
364 title, author and others. The information is stored in a dictionary
365 which is then passed to the FileDownloader. The FileDownloader
366 processes this information possibly downloading the video to the file
367 system, among other possible outcomes. The dictionaries must include
368 the following fields:
370 id: Video identifier.
371 url: Final video URL.
372 uploader: Nickname of the video uploader.
373 title: Literal title.
374 stitle: Simplified title.
375 ext: Video filename extension.
377 Subclasses of this one should re-define the _real_initialize() and
378 _real_extract() methods, as well as the suitable() static method.
379 Probably, they should also be instantiated and added to the main
386 def __init__(self, downloader=None):
387 """Constructor. Receives an optional downloader."""
389 self.set_downloader(downloader)
393 """Receives a URL and returns True if suitable for this IE."""
396 def initialize(self):
397 """Initializes an instance (authentication, etc)."""
399 self._real_initialize()
402 def extract(self, url):
403 """Extracts URL information and returns it in list of dicts."""
405 return self._real_extract(url)
407 def set_downloader(self, downloader):
408 """Sets the downloader for this IE."""
409 self._downloader = downloader
411 def _real_initialize(self):
412 """Real initialization process. Redefine in subclasses."""
415 def _real_extract(self, url):
416 """Real extraction process. Redefine in subclasses."""
419 class YoutubeIE(InfoExtractor):
420 """Information extractor for youtube.com."""
422 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
423 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
424 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
425 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
426 _NETRC_MACHINE = 'youtube'
430 return (re.match(YoutubeIE._VALID_URL, url) is not None)
433 def htmlentity_transform(matchobj):
434 """Transforms an HTML entity to a Unicode character."""
435 entity = matchobj.group(1)
437 # Known non-numeric HTML entity
438 if entity in htmlentitydefs.name2codepoint:
439 return unichr(htmlentitydefs.name2codepoint[entity])
442 mobj = re.match(ur'(?u)#(x?\d+)', entity)
444 numstr = mobj.group(1)
445 if numstr.startswith(u'x'):
447 numstr = u'0%s' % numstr
450 return unichr(long(numstr, base))
452 # Unknown entity in name, return its literal representation
453 return (u'&%s;' % entity)
455 def report_lang(self):
456 """Report attempt to set language."""
457 self._downloader.to_stdout(u'[youtube] Setting language')
459 def report_login(self):
460 """Report attempt to log in."""
461 self._downloader.to_stdout(u'[youtube] Logging in')
463 def report_age_confirmation(self):
464 """Report attempt to confirm age."""
465 self._downloader.to_stdout(u'[youtube] Confirming age')
467 def report_webpage_download(self, video_id):
468 """Report attempt to download webpage."""
469 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
471 def report_information_extraction(self, video_id):
472 """Report attempt to extract video information."""
473 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
475 def report_video_url(self, video_id, video_real_url):
476 """Report extracted video URL."""
477 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
479 def _real_initialize(self):
480 if self._downloader is None:
485 downloader_params = self._downloader.params
487 # Attempt to use provided username and password or .netrc data
488 if downloader_params.get('username', None) is not None:
489 username = downloader_params['username']
490 password = downloader_params['password']
491 elif downloader_params.get('usenetrc', False):
493 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
498 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
499 except (IOError, netrc.NetrcParseError), err:
500 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
504 request = urllib2.Request(self._LANG_URL, None, std_headers)
507 urllib2.urlopen(request).read()
508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
509 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
512 # No authentication to be performed
518 'current_form': 'loginForm',
520 'action_login': 'Log In',
521 'username': username,
522 'password': password,
524 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
527 login_results = urllib2.urlopen(request).read()
528 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
529 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
532 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
538 'action_confirm': 'Confirm',
540 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
542 self.report_age_confirmation()
543 age_results = urllib2.urlopen(request).read()
544 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
545 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
548 def _real_extract(self, url):
549 # Extract video id from URL
550 mobj = re.match(self._VALID_URL, url)
552 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
554 video_id = mobj.group(2)
556 # Downloader parameters
558 if self._downloader is not None:
559 params = self._downloader.params
560 format_param = params.get('format', None)
567 }.get(format_param, 'flv')
569 # Normalize URL, including format
570 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
571 if format_param is not None:
572 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
573 request = urllib2.Request(normalized_url, None, std_headers)
575 self.report_webpage_download(video_id)
576 video_webpage = urllib2.urlopen(request).read()
577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
578 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
580 self.report_information_extraction(video_id)
583 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
585 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
587 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
588 if format_param is not None:
589 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
590 self.report_video_url(video_id, video_real_url)
593 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
595 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
597 video_uploader = mobj.group(1)
600 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
602 self._downloader.trouble(u'ERROR: unable to extract video title')
604 video_title = mobj.group(1).decode('utf-8')
605 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
606 video_title = video_title.replace(os.sep, u'%')
609 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
610 simple_title = simple_title.strip(ur'_')
612 # Process video information
613 self._downloader.process_info({
614 'id': video_id.decode('utf-8'),
615 'url': video_real_url.decode('utf-8'),
616 'uploader': video_uploader.decode('utf-8'),
617 'title': video_title,
618 'stitle': simple_title,
619 'ext': video_extension.decode('utf-8'),
622 class MetacafeIE(InfoExtractor):
623 """Information Extractor for metacafe.com."""
625 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
626 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
627 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
630 def __init__(self, youtube_ie, downloader=None):
631 InfoExtractor.__init__(self, downloader)
632 self._youtube_ie = youtube_ie
636 return (re.match(MetacafeIE._VALID_URL, url) is not None)
638 def report_disclaimer(self):
639 """Report disclaimer retrieval."""
640 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
642 def report_age_confirmation(self):
643 """Report attempt to confirm age."""
644 self._downloader.to_stdout(u'[metacafe] Confirming age')
646 def report_download_webpage(self, video_id):
647 """Report webpage download."""
648 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
650 def report_extraction(self, video_id):
651 """Report information extraction."""
652 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
654 def _real_initialize(self):
655 # Retrieve disclaimer
656 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
658 self.report_disclaimer()
659 disclaimer = urllib2.urlopen(request).read()
660 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
661 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
667 'submit': "Continue - I'm over 18",
669 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
671 self.report_age_confirmation()
672 disclaimer = urllib2.urlopen(request).read()
673 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
674 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
677 def _real_extract(self, url):
678 # Extract id and simplified title from URL
679 mobj = re.match(self._VALID_URL, url)
681 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
684 video_id = mobj.group(1)
686 # Check if video comes from YouTube
687 mobj2 = re.match(r'^yt-(.*)$', video_id)
688 if mobj2 is not None:
689 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
692 simple_title = mobj.group(2).decode('utf-8')
693 video_extension = 'flv'
695 # Retrieve video webpage to extract further information
696 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
698 self.report_download_webpage(video_id)
699 webpage = urllib2.urlopen(request).read()
700 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
701 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
704 # Extract URL, uploader and title from webpage
705 self.report_extraction(video_id)
706 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
708 self._downloader.trouble(u'ERROR: unable to extract media URL')
710 mediaURL = urllib.unquote(mobj.group(1))
712 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
714 self._downloader.trouble(u'ERROR: unable to extract gdaKey')
716 gdaKey = mobj.group(1)
718 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
720 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
722 self._downloader.trouble(u'ERROR: unable to extract title')
724 video_title = mobj.group(1).decode('utf-8')
726 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
728 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
730 video_uploader = mobj.group(1)
732 # Process video information
733 self._downloader.process_info({
734 'id': video_id.decode('utf-8'),
735 'url': video_url.decode('utf-8'),
736 'uploader': video_uploader.decode('utf-8'),
737 'title': video_title,
738 'stitle': simple_title,
739 'ext': video_extension.decode('utf-8'),
743 class YoutubeSearchIE(InfoExtractor):
744 """Information Extractor for YouTube search queries."""
745 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
746 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
747 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
748 _MORE_PAGES_INDICATOR = r'>Next</a>'
750 _max_youtube_results = 1000
752 def __init__(self, youtube_ie, downloader=None):
753 InfoExtractor.__init__(self, downloader)
754 self._youtube_ie = youtube_ie
758 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
760 def report_download_page(self, query, pagenum):
761 """Report attempt to download playlist page with given number."""
762 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
764 def _real_initialize(self):
765 self._youtube_ie.initialize()
767 def _real_extract(self, query):
768 mobj = re.match(self._VALID_QUERY, query)
770 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
773 prefix, query = query.split(':')
776 self._download_n_results(query, 1)
778 elif prefix == 'all':
779 self._download_n_results(query, self._max_youtube_results)
785 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
787 elif n > self._max_youtube_results:
788 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
789 n = self._max_youtube_results
790 self._download_n_results(query, n)
792 except ValueError: # parsing prefix as int fails
793 self._download_n_results(query, 1)
796 def _download_n_results(self, query, n):
797 """Downloads a specified number of results for a query"""
804 self.report_download_page(query, pagenum)
805 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
806 request = urllib2.Request(result_url, None, std_headers)
808 page = urllib2.urlopen(request).read()
809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
813 # Extract video identifiers
814 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
815 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
816 if video_id not in already_seen:
817 video_ids.append(video_id)
818 already_seen.add(video_id)
819 if len(video_ids) == n:
820 # Specified n videos reached
822 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
825 if self._MORE_PAGES_INDICATOR not in page:
827 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
830 pagenum = pagenum + 1
832 class YoutubePlaylistIE(InfoExtractor):
833 """Information Extractor for YouTube playlists."""
835 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
836 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
837 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
838 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
841 def __init__(self, youtube_ie, downloader=None):
842 InfoExtractor.__init__(self, downloader)
843 self._youtube_ie = youtube_ie
847 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
849 def report_download_page(self, playlist_id, pagenum):
850 """Report attempt to download playlist page with given number."""
851 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
853 def _real_initialize(self):
854 self._youtube_ie.initialize()
856 def _real_extract(self, url):
857 # Extract playlist id
858 mobj = re.match(self._VALID_URL, url)
860 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
863 # Download playlist pages
864 playlist_id = mobj.group(1)
869 self.report_download_page(playlist_id, pagenum)
870 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
872 page = urllib2.urlopen(request).read()
873 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
874 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
877 # Extract video identifiers
879 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
880 if mobj.group(1) not in ids_in_page:
881 ids_in_page.append(mobj.group(1))
882 video_ids.extend(ids_in_page)
884 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
886 pagenum = pagenum + 1
889 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Base class for post-processing steps.

    PostProcessor objects are registered on a FileDownloader through
    its add_post_processor() method. After every successful download
    the downloader walks its internal chain of PostProcessors, calling
    run() on each one — first with an initial information dictionary,
    then with whatever the previous processor returned.

    The chain stops as soon as a run() call returns None, or when its
    end is reached.

    PostProcessor objects follow a "mutual registration" process
    similar to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the FileDownloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        "information" is a dictionary like the ones produced by
        InfoExtractors, extended with an extra "filepath" key that
        points at the downloaded file on disk.

        Returning None stops the postprocessing chain; returning a
        (possibly modified) information dictionary passes it on to the
        next processor in the chain. A PostProcessingError may be
        raised to report a failure to the downloader.
        """
        return information  # by default, do nothing
939 if __name__ == '__main__':
941 # Modules needed only when running the main program
945 # General configuration
946 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
947 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
948 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
951 parser = optparse.OptionParser(
952 usage='Usage: %prog [options] url...',
954 conflict_handler='resolve',
956 parser.add_option('-h', '--help',
957 action='help', help='print this help text and exit')
958 parser.add_option('-v', '--version',
959 action='version', help='print program version and exit')
960 parser.add_option('-u', '--username',
961 dest='username', metavar='UN', help='account username')
962 parser.add_option('-p', '--password',
963 dest='password', metavar='PW', help='account password')
964 parser.add_option('-o', '--output',
965 dest='outtmpl', metavar='TPL', help='output filename template')
966 parser.add_option('-q', '--quiet',
967 action='store_true', dest='quiet', help='activates quiet mode', default=False)
968 parser.add_option('-s', '--simulate',
969 action='store_true', dest='simulate', help='do not download video', default=False)
970 parser.add_option('-t', '--title',
971 action='store_true', dest='usetitle', help='use title in file name', default=False)
972 parser.add_option('-l', '--literal',
973 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
974 parser.add_option('-n', '--netrc',
975 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
976 parser.add_option('-g', '--get-url',
977 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
978 parser.add_option('-e', '--get-title',
979 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
980 parser.add_option('-f', '--format',
981 dest='format', metavar='FMT', help='video format code')
982 parser.add_option('-m', '--mobile-version',
983 action='store_const', dest='format', help='alias for -f 17', const='17')
984 parser.add_option('-d', '--high-def',
985 action='store_const', dest='format', help='alias for -f 22', const='22')
986 parser.add_option('-i', '--ignore-errors',
987 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
988 parser.add_option('-r', '--rate-limit',
989 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
990 parser.add_option('-a', '--batch-file',
991 dest='batchfile', metavar='F', help='file containing URLs to download')
992 parser.add_option('-w', '--no-overwrites',
993 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
994 (opts, args) = parser.parse_args()
996 # Batch file verification
998 if opts.batchfile is not None:
1000 batchurls = open(opts.batchfile, 'r').readlines()
1001 batchurls = [x.strip() for x in batchurls]
1002 batchurls = [x for x in batchurls if len(x) > 0]
1004 sys.exit(u'ERROR: batch file could not be read')
1005 all_urls = batchurls + args
1007 # Conflicting, missing and erroneous options
1008 if len(all_urls) < 1:
1009 sys.exit(u'ERROR: you must provide at least one URL')
1010 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1011 sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
1012 if opts.password is not None and opts.username is None:
1013 sys.exit(u'ERROR: account username missing')
1014 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1015 sys.exit(u'ERROR: using output template conflicts with using title or literal title')
1016 if opts.usetitle and opts.useliteral:
1017 sys.exit(u'ERROR: using title conflicts with using literal title')
1018 if opts.username is not None and opts.password is None:
1019 opts.password = getpass.getpass(u'Type account password and press return:')
1020 if opts.ratelimit is not None:
1021 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1022 if numeric_limit is None:
1023 sys.exit(u'ERROR: invalid rate limit specified')
1024 opts.ratelimit = numeric_limit
1026 # Information extractors
1027 youtube_ie = YoutubeIE()
1028 metacafe_ie = MetacafeIE(youtube_ie)
1029 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1030 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1033 fd = FileDownloader({
1034 'usenetrc': opts.usenetrc,
1035 'username': opts.username,
1036 'password': opts.password,
1037 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1038 'forceurl': opts.geturl,
1039 'forcetitle': opts.gettitle,
1040 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1041 'format': opts.format,
1042 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1043 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1044 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1045 or u'%(id)s.%(ext)s'),
1046 'ignoreerrors': opts.ignoreerrors,
1047 'ratelimit': opts.ratelimit,
1048 'nooverwrites': opts.nooverwrites,
1050 fd.add_info_extractor(youtube_search_ie)
1051 fd.add_info_extractor(youtube_pl_ie)
1052 fd.add_info_extractor(metacafe_ie)
1053 fd.add_info_extractor(youtube_ie)
1054 retcode = fd.download(all_urls)
1057 except DownloadError:
1059 except SameFileError:
1060 sys.exit(u'ERROR: fixed output name but more than one file to download')
1061 except KeyboardInterrupt:
1062 sys.exit(u'\nERROR: Interrupted by user')