2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
22 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25 'Accept-Language': 'en-us,en;q=0.5',
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
30 class DownloadError(Exception):
31 """Download Error exception.
33 This exception may be thrown by FileDownloader objects if they are not
34 configured to continue on errors. They will contain the appropriate
39 class SameFileError(Exception):
40 """Same File exception.
42 This exception will be thrown by FileDownloader objects if they detect
43 multiple files would have to be downloaded to the same file on disk.
47 class PostProcessingError(Exception):
48 """Post Processing exception.
50 This exception may be raised by PostProcessor's .run() method to
51 indicate an error in the postprocessing task.
55 class FileDownloader(object):
56 """File Downloader class.
58 File downloader objects are the ones responsible for downloading the
59 actual video file and writing it to disk if the user has requested
60 it, among some other tasks. In most cases there should be one per
61 program. As, given a video URL, the downloader doesn't know how to
62 extract all the needed information, task that InfoExtractors do, it
63 has to pass the URL to one of them.
65 For this, file downloader objects have a method that allows
66 InfoExtractors to be registered in a given order. When it is passed
67 a URL, the file downloader hands it to the first InfoExtractor it
68 finds that reports being able to handle it. The InfoExtractor extracts
69 all the information about the video or videos the URL refers to, and
70 asks the FileDownloader to process the video information, possibly
71 downloading the video.
73 File downloaders accept a lot of parameters. In order not to saturate
74 the object constructor with arguments, it receives a dictionary of
75 options instead. These options are available through the params
76 attribute for the InfoExtractors to use. The FileDownloader also
77 registers itself as the downloader in charge for the InfoExtractors
78 that are added to it, so this is a "mutual registration".
82 username: Username for authentication purposes.
83 password: Password for authentication purposes.
84 usenetrc: Use netrc for authentication instead.
85 quiet: Do not print messages to stdout.
86 forceurl: Force printing final URL.
87 forcetitle: Force printing title.
88 simulate: Do not download the video files.
89 format: Video format code.
90 outtmpl: Template for output names.
91 ignoreerrors: Do not stop on download errors.
92 ratelimit: Download speed limit, in bytes/sec.
93 nooverwrites: Prevent overwriting files.
99 _download_retcode = None
101 def __init__(self, params):
102 """Create a FileDownloader object with the given options."""
105 self._download_retcode = 0
109 def pmkdir(filename):
110 """Create directory components in filename. Similar to Unix "mkdir -p"."""
111 components = filename.split(os.sep)
112 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
113 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
114 for dir in aggregate:
115 if not os.path.exists(dir):
119 def format_bytes(bytes):
125 exponent = long(math.log(float(bytes), 1024.0))
126 suffix = 'bkMGTPEZY'[exponent]
127 converted = float(bytes) / float(1024**exponent)
128 return '%.2f%s' % (converted, suffix)
131 def calc_percent(byte_counter, data_len):
134 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
137 def calc_eta(start, now, total, current):
141 if current == 0 or dif < 0.001: # One millisecond
143 rate = float(current) / dif
144 eta = long((float(total) - float(current)) / rate)
145 (eta_mins, eta_secs) = divmod(eta, 60)
148 return '%02d:%02d' % (eta_mins, eta_secs)
151 def calc_speed(start, now, bytes):
153 if bytes == 0 or dif < 0.001: # One millisecond
154 return '%10s' % '---b/s'
155 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
158 def best_block_size(elapsed_time, bytes):
159 new_min = max(bytes / 2.0, 1.0)
160 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
161 if elapsed_time < 0.001:
163 rate = bytes / elapsed_time
171 def parse_bytes(bytestr):
172 """Parse a string indicating a byte quantity into a long integer."""
173 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
176 number = float(matchobj.group(1))
177 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
178 return long(round(number * multiplier))
180 def add_info_extractor(self, ie):
181 """Add an InfoExtractor object to the end of the list."""
183 ie.set_downloader(self)
185 def add_post_processor(self, pp):
186 """Add a PostProcessor object to the end of the chain."""
188 pp.set_downloader(self)
190 def to_stdout(self, message, skip_eol=False):
191 """Print message to stdout if not in quiet mode."""
192 if not self.params.get('quiet', False):
193 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
def to_stderr(self, message):
    """Print message to stderr.

    Unicode messages are encoded with the locale's preferred encoding
    before printing, the same way to_stdout() encodes its output. This
    keeps non-ASCII text (video titles, file names) from raising
    UnicodeEncodeError when stderr has no encoding of its own, e.g.
    when it is redirected to a file or a pipe.

    Byte strings and other objects are printed unchanged.
    """
    if isinstance(message, unicode):
        message = message.encode(locale.getpreferredencoding())
    print >>sys.stderr, message
200 def fixed_template(self):
201 """Checks if the output template is fixed."""
202 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """React to a download problem.

    The message, when one is given, is first printed through
    to_stderr(). Then, unless the 'ignoreerrors' parameter is set,
    DownloadError is raised carrying that message. When errors are
    being ignored, the problem is only recorded by setting the
    download return code to 1.
    """
    if message is not None:
        self.to_stderr(message)

    keep_going = self.params.get('ignoreerrors', False)
    if keep_going:
        # Remember that something failed so download() can report it.
        self._download_retcode = 1
        return
    raise DownloadError(message)
217 def slow_down(self, start_time, byte_counter):
218 """Sleep if the download speed is over the rate limit."""
219 rate_limit = self.params.get('ratelimit', None)
220 if rate_limit is None or byte_counter == 0:
223 elapsed = now - start_time
226 speed = float(byte_counter) / elapsed
227 if speed > rate_limit:
228 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Announce the destination filename through to_stdout()."""
    emit = self.to_stdout
    line = u'[download] Destination: %s' % filename
    emit(line)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Print one progress line built from the preformatted fields.

    The line starts with a carriage return and is sent to to_stdout()
    with skip_eol=True, so no newline is appended to it.
    """
    emit = self.to_stdout
    fields = (percent_str, data_len_str, speed_str, eta_str)
    line = u'\r[download] %s of %s at %s ETA %s' % fields
    emit(line, skip_eol=True)
239 def report_finish(self):
240 """Report download finished."""
243 def process_info(self, info_dict):
244 """Process a single dictionary returned by an InfoExtractor."""
246 if self.params.get('forcetitle', False):
247 print info_dict['title'].encode(locale.getpreferredencoding())
248 if self.params.get('forceurl', False):
249 print info_dict['url'].encode(locale.getpreferredencoding())
251 # Do nothing else if in simulate mode
252 if self.params.get('simulate', False):
256 filename = self.params['outtmpl'] % info_dict
257 self.report_destination(filename)
258 except (ValueError, KeyError), err:
259 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
260 if self.params['nooverwrites'] and os.path.exists(filename):
261 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
264 self.pmkdir(filename)
265 except (OSError, IOError), err:
266 self.trouble('ERROR: unable to create directories: %s' % str(err))
269 outstream = open(filename, 'wb')
270 except (OSError, IOError), err:
271 self.trouble('ERROR: unable to open for writing: %s' % str(err))
274 self._do_download(outstream, info_dict['url'])
276 except (OSError, IOError), err:
277 self.trouble('ERROR: unable to write video data: %s' % str(err))
279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
280 self.trouble('ERROR: unable to download video data: %s' % str(err))
283 self.post_process(filename, info_dict)
284 except (PostProcessingError), err:
285 self.trouble('ERROR: postprocessing: %s' % str(err))
290 def download(self, url_list):
291 """Download a given list of URLs."""
292 if len(url_list) > 1 and self.fixed_template():
293 raise SameFileError(self.params['outtmpl'])
296 suitable_found = False
298 # Go to next InfoExtractor if not suitable
299 if not ie.suitable(url):
302 # Suitable InfoExtractor found
303 suitable_found = True
305 # Extract information from URL and process it
308 # Suitable InfoExtractor had been found; go to next URL
311 if not suitable_found:
312 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
314 return self._download_retcode
316 def post_process(self, filename, ie_info):
317 """Run the postprocessing chain on the given file."""
319 info['filepath'] = filename
325 def _do_download(self, stream, url):
326 request = urllib2.Request(url, None, std_headers)
327 data = urllib2.urlopen(request)
328 data_len = data.info().get('Content-length', None)
329 data_len_str = self.format_bytes(data_len)
335 percent_str = self.calc_percent(byte_counter, data_len)
336 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
337 speed_str = self.calc_speed(start, time.time(), byte_counter)
338 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
342 data_block = data.read(block_size)
344 data_block_len = len(data_block)
345 if data_block_len == 0:
347 byte_counter += data_block_len
348 stream.write(data_block)
349 block_size = self.best_block_size(after - before, data_block_len)
352 self.slow_down(start, byte_counter)
355 if data_len is not None and str(byte_counter) != data_len:
356 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
358 class InfoExtractor(object):
359 """Information Extractor class.
361 Information extractors are the classes that, given a URL, extract
362 information from the video (or videos) the URL refers to. This
363 information includes the real video URL, the video title and simplified
364 title, author and others. The information is stored in a dictionary
365 which is then passed to the FileDownloader. The FileDownloader
366 processes this information possibly downloading the video to the file
367 system, among other possible outcomes. The dictionaries must include
368 the following fields:
370 id: Video identifier.
371 url: Final video URL.
372 uploader: Nickname of the video uploader.
373 title: Literal title.
374 stitle: Simplified title.
375 ext: Video filename extension.
377 Subclasses of this one should re-define the _real_initialize() and
378 _real_extract() methods, as well as the suitable() static method.
379 Probably, they should also be instantiated and added to the main
386 def __init__(self, downloader=None):
387 """Constructor. Receives an optional downloader."""
389 self.set_downloader(downloader)
393 """Receives a URL and returns True if suitable for this IE."""
396 def initialize(self):
397 """Initializes an instance (authentication, etc)."""
399 self._real_initialize()
402 def extract(self, url):
403 """Extracts URL information and returns it in list of dicts."""
405 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember the given downloader as the one this IE reports to.

    The object is stored as-is in self._downloader; None is accepted.
    """
    self._downloader = downloader
411 def _real_initialize(self):
412 """Real initialization process. Redefine in subclasses."""
415 def _real_extract(self, url):
416 """Real extraction process. Redefine in subclasses."""
419 class YoutubeIE(InfoExtractor):
420 """Information extractor for youtube.com."""
422 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
423 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
424 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
425 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
426 _NETRC_MACHINE = 'youtube'
430 return (re.match(YoutubeIE._VALID_URL, url) is not None)
433 def htmlentity_transform(matchobj):
434 """Transforms an HTML entity to a Unicode character."""
435 entity = matchobj.group(1)
437 # Known non-numeric HTML entity
438 if entity in htmlentitydefs.name2codepoint:
439 return unichr(htmlentitydefs.name2codepoint[entity])
442 mobj = re.match(ur'(?u)#(x?\d+)', entity)
444 numstr = mobj.group(1)
445 if numstr.startswith(u'x'):
447 numstr = u'0%s' % numstr
450 return unichr(long(numstr, base))
452 # Unknown entity in name, return its literal representation
453 return (u'&%s;' % entity)
def report_lang(self):
    """Tell the downloader that the language is about to be set."""
    emit = self._downloader.to_stdout
    note = u'[youtube] Setting language'
    emit(note)
def report_login(self):
    """Tell the downloader that a login attempt is starting."""
    emit = self._downloader.to_stdout
    note = u'[youtube] Logging in'
    emit(note)
def report_age_confirmation(self):
    """Tell the downloader that age confirmation is being attempted."""
    emit = self._downloader.to_stdout
    note = u'[youtube] Confirming age'
    emit(note)
def report_webpage_download(self, video_id):
    """Tell the downloader that the video webpage is being fetched."""
    emit = self._downloader.to_stdout
    note = u'[youtube] %s: Downloading video webpage' % video_id
    emit(note)
def report_information_extraction(self, video_id):
    """Tell the downloader that video information is being extracted."""
    emit = self._downloader.to_stdout
    note = u'[youtube] %s: Extracting video information' % video_id
    emit(note)
def report_video_url(self, video_id, video_real_url):
    """Tell the downloader which real URL was extracted for the video."""
    emit = self._downloader.to_stdout
    details = (video_id, video_real_url)
    note = u'[youtube] %s: URL: %s' % details
    emit(note)
479 def _real_initialize(self):
480 if self._downloader is None:
485 downloader_params = self._downloader.params
487 # Attempt to use provided username and password or .netrc data
488 if downloader_params.get('username', None) is not None:
489 username = downloader_params['username']
490 password = downloader_params['password']
491 elif downloader_params.get('usenetrc', False):
493 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
498 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
499 except (IOError, netrc.NetrcParseError), err:
500 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
504 request = urllib2.Request(self._LANG_URL, None, std_headers)
507 urllib2.urlopen(request).read()
508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
509 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
512 # No authentication to be performed
518 'current_form': 'loginForm',
520 'action_login': 'Log In',
521 'username': username,
522 'password': password,
524 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
527 login_results = urllib2.urlopen(request).read()
528 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
529 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
532 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
538 'action_confirm': 'Confirm',
540 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
542 self.report_age_confirmation()
543 age_results = urllib2.urlopen(request).read()
544 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
545 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
548 def _real_extract(self, url):
549 # Extract video id from URL
550 mobj = re.match(self._VALID_URL, url)
552 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
554 video_id = mobj.group(2)
556 # Downloader parameters
558 if self._downloader is not None:
559 params = self._downloader.params
560 format_param = params.get('format', None)
567 }.get(format_param, 'flv')
569 # Normalize URL, including format
570 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
571 if format_param is not None:
572 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
573 request = urllib2.Request(normalized_url, None, std_headers)
575 self.report_webpage_download(video_id)
576 video_webpage = urllib2.urlopen(request).read()
577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
578 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
580 self.report_information_extraction(video_id)
583 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
585 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
587 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
588 if format_param is not None:
589 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
590 self.report_video_url(video_id, video_real_url)
593 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
595 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
597 video_uploader = mobj.group(1)
600 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
602 self._downloader.trouble(u'ERROR: unable to extract video title')
604 video_title = mobj.group(1).decode('utf-8')
605 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
606 video_title = video_title.replace(os.sep, u'%')
609 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
610 simple_title = simple_title.strip(ur'_')
612 # Process video information
613 self._downloader.process_info({
614 'id': video_id.decode('utf-8'),
615 'url': video_real_url.decode('utf-8'),
616 'uploader': video_uploader.decode('utf-8'),
617 'title': video_title,
618 'stitle': simple_title,
619 'ext': video_extension.decode('utf-8'),
622 class MetacafeIE(InfoExtractor):
623 """Information Extractor for metacafe.com."""
625 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
626 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
def __init__(self, youtube_ie, downloader=None):
    """Set up the extractor around an existing YouTube extractor.

    The base InfoExtractor constructor receives the optional
    downloader; youtube_ie is kept in self._youtube_ie for later use.
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
635 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Tell the downloader that the disclaimer is being retrieved."""
    emit = self._downloader.to_stdout
    note = u'[metacafe] Retrieving disclaimer'
    emit(note)
def report_age_confirmation(self):
    """Tell the downloader that age confirmation is being attempted."""
    emit = self._downloader.to_stdout
    note = u'[metacafe] Confirming age'
    emit(note)
def report_download_webpage(self, video_id):
    """Tell the downloader that the video webpage is being fetched."""
    emit = self._downloader.to_stdout
    note = u'[metacafe] %s: Downloading webpage' % video_id
    emit(note)
def report_extraction(self, video_id):
    """Tell the downloader that information is being extracted."""
    emit = self._downloader.to_stdout
    note = u'[metacafe] %s: Extracting information' % video_id
    emit(note)
653 def _real_initialize(self):
654 # Retrieve disclaimer
655 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
657 self.report_disclaimer()
658 disclaimer = urllib2.urlopen(request).read()
659 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
660 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
666 'submit': "Continue - I'm over 18",
668 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
670 self.report_age_confirmation()
671 disclaimer = urllib2.urlopen(request).read()
672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
673 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
676 def _real_extract(self, url):
677 # Extract id and simplified title from URL
678 mobj = re.match(self._VALID_URL, url)
680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
683 video_id = mobj.group(1)
685 # Check if video comes from YouTube
686 mobj2 = re.match(r'^yt-(.*)$', video_id)
687 if mobj2 is not None:
688 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
691 simple_title = mobj.group(2).decode('utf-8')
692 video_extension = 'flv'
694 # Retrieve video webpage to extract further information
695 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
697 self.report_download_webpage(video_id)
698 webpage = urllib2.urlopen(request).read()
699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
700 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
703 # Extract URL, uploader and title from webpage
704 self.report_extraction(video_id)
705 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
707 self._downloader.trouble(u'ERROR: unable to extract media URL')
709 mediaURL = mobj.group(1).replace('\\', '')
711 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
713 self._downloader.trouble(u'ERROR: unable to extract gdaKey')
715 gdaKey = mobj.group(1)
717 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
719 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
721 self._downloader.trouble(u'ERROR: unable to extract title')
723 video_title = mobj.group(1).decode('utf-8')
725 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
727 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
729 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
731 # Process video information
732 self._downloader.process_info({
733 'id': video_id.decode('utf-8'),
734 'url': video_url.decode('utf-8'),
735 'uploader': video_uploader.decode('utf-8'),
736 'title': video_title,
737 'stitle': simple_title,
738 'ext': video_extension.decode('utf-8'),
742 class YoutubeSearchIE(InfoExtractor):
743 """Information Extractor for YouTube search queries."""
744 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
745 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
746 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
747 _MORE_PAGES_INDICATOR = r'>Next</a>'
749 _max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
    """Set up the search extractor around an existing YouTube extractor.

    The base InfoExtractor constructor receives the optional
    downloader; youtube_ie is kept in self._youtube_ie for later use.
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
757 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Tell the downloader which results page of a query is being fetched."""
    emit = self._downloader.to_stdout
    details = (query, pagenum)
    note = u'[youtube] query "%s": Downloading page %s' % details
    emit(note)
def _real_initialize(self):
    """Initialize by delegating to the wrapped YouTube extractor."""
    delegate = self._youtube_ie
    delegate.initialize()
766 def _real_extract(self, query):
767 mobj = re.match(self._VALID_QUERY, query)
769 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
772 prefix, query = query.split(':')
775 self._download_n_results(query, 1)
777 elif prefix == 'all':
778 self._download_n_results(query, self._max_youtube_results)
784 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
786 elif n > self._max_youtube_results:
787 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
788 n = self._max_youtube_results
789 self._download_n_results(query, n)
791 except ValueError: # parsing prefix as int fails
792 self._download_n_results(query, 1)
795 def _download_n_results(self, query, n):
796 """Downloads a specified number of results for a query"""
803 self.report_download_page(query, pagenum)
804 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
805 request = urllib2.Request(result_url, None, std_headers)
807 page = urllib2.urlopen(request).read()
808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
809 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
812 # Extract video identifiers
813 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
814 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
815 if video_id not in already_seen:
816 video_ids.append(video_id)
817 already_seen.add(video_id)
818 if len(video_ids) == n:
819 # Specified n videos reached
821 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
824 if self._MORE_PAGES_INDICATOR not in page:
826 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
829 pagenum = pagenum + 1
831 class YoutubePlaylistIE(InfoExtractor):
832 """Information Extractor for YouTube playlists."""
834 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
835 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
836 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
837 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
def __init__(self, youtube_ie, downloader=None):
    """Set up the playlist extractor around an existing YouTube extractor.

    The base InfoExtractor constructor receives the optional
    downloader; youtube_ie is kept in self._youtube_ie for later use.
    """
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
846 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
def report_download_page(self, playlist_id, pagenum):
    """Tell the downloader which page of a playlist is being fetched."""
    emit = self._downloader.to_stdout
    details = (playlist_id, pagenum)
    note = u'[youtube] PL %s: Downloading page #%s' % details
    emit(note)
def _real_initialize(self):
    """Initialize by delegating to the wrapped YouTube extractor."""
    delegate = self._youtube_ie
    delegate.initialize()
855 def _real_extract(self, url):
856 # Extract playlist id
857 mobj = re.match(self._VALID_URL, url)
859 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
862 # Download playlist pages
863 playlist_id = mobj.group(1)
868 self.report_download_page(playlist_id, pagenum)
869 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
871 page = urllib2.urlopen(request).read()
872 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
873 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
876 # Extract video identifiers
878 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
879 if mobj.group(1) not in ids_in_page:
880 ids_in_page.append(mobj.group(1))
881 video_ids.extend(ids_in_page)
883 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
885 pagenum = pagenum + 1
888 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
891 class PostProcessor(object):
892 """Post Processor class.
894 PostProcessor objects can be added to downloaders with their
895 add_post_processor() method. When the downloader has finished a
896 successful download, it will take its internal chain of PostProcessors
897 and start calling the run() method on each one of them, first with
898 an initial argument and then with the returned value of the previous
901 The chain will be stopped if one of them ever returns None or the end
902 of the chain is reached.
904 PostProcessor objects follow a "mutual registration" process similar
905 to InfoExtractor objects.
def __init__(self, downloader=None):
    """Create a PostProcessor, optionally bound to a downloader.

    The downloader is stored as-is in self._downloader; it defaults
    to None.
    """
    self._downloader = downloader
def set_downloader(self, downloader):
    """Remember the given downloader as the one this PP belongs to.

    The object is stored as-is in self._downloader, replacing any
    previous value.
    """
    self._downloader = downloader
917 def run(self, information):
918 """Run the PostProcessor.
920 The "information" argument is a dictionary like the ones
921 returned by InfoExtractors. The only difference is that this
922 one has an extra field called "filepath" that points to the
925 When this method returns None, the postprocessing chain is
926 stopped. However, this method may return an information
927 dictionary that will be passed to the next postprocessing
928 object in the chain. It can be the one it received after
929 changing some fields.
931 In addition, this method may raise a PostProcessingError
932 exception that will be taken into account by the downloader
935 return information # by default, do nothing
938 if __name__ == '__main__':
940 # Modules needed only when running the main program
944 # General configuration
945 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
946 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
947 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
950 parser = optparse.OptionParser(
951 usage='Usage: %prog [options] url...',
953 conflict_handler='resolve',
955 parser.add_option('-h', '--help',
956 action='help', help='print this help text and exit')
957 parser.add_option('-v', '--version',
958 action='version', help='print program version and exit')
959 parser.add_option('-u', '--username',
960 dest='username', metavar='UN', help='account username')
961 parser.add_option('-p', '--password',
962 dest='password', metavar='PW', help='account password')
963 parser.add_option('-o', '--output',
964 dest='outtmpl', metavar='TPL', help='output filename template')
965 parser.add_option('-q', '--quiet',
966 action='store_true', dest='quiet', help='activates quiet mode', default=False)
967 parser.add_option('-s', '--simulate',
968 action='store_true', dest='simulate', help='do not download video', default=False)
969 parser.add_option('-t', '--title',
970 action='store_true', dest='usetitle', help='use title in file name', default=False)
971 parser.add_option('-l', '--literal',
972 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
973 parser.add_option('-n', '--netrc',
974 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
975 parser.add_option('-g', '--get-url',
976 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
977 parser.add_option('-e', '--get-title',
978 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
979 parser.add_option('-f', '--format',
980 dest='format', metavar='FMT', help='video format code')
981 parser.add_option('-m', '--mobile-version',
982 action='store_const', dest='format', help='alias for -f 17', const='17')
983 parser.add_option('-d', '--high-def',
984 action='store_const', dest='format', help='alias for -f 22', const='22')
985 parser.add_option('-i', '--ignore-errors',
986 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
987 parser.add_option('-r', '--rate-limit',
988 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
989 parser.add_option('-a', '--batch-file',
990 dest='batchfile', metavar='F', help='file containing URLs to download')
991 parser.add_option('-w', '--no-overwrites',
992 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
993 (opts, args) = parser.parse_args()
995 # Batch file verification
997 if opts.batchfile is not None:
999 batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
1001 sys.exit(u'ERROR: batch file could not be read')
1002 all_urls = batchurls + args
1004 # Conflicting, missing and erroneous options
1005 if len(all_urls) < 1:
1006 sys.exit(u'ERROR: you must provide at least one URL')
1007 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1008 sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
1009 if opts.password is not None and opts.username is None:
1010 sys.exit(u'ERROR: account username missing')
1011 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1012 sys.exit(u'ERROR: using output template conflicts with using title or literal title')
1013 if opts.usetitle and opts.useliteral:
1014 sys.exit(u'ERROR: using title conflicts with using literal title')
1015 if opts.username is not None and opts.password is None:
1016 opts.password = getpass.getpass(u'Type account password and press return:')
1017 if opts.ratelimit is not None:
1018 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1019 if numeric_limit is None:
1020 sys.exit(u'ERROR: invalid rate limit specified')
1021 opts.ratelimit = numeric_limit
1023 # Information extractors
1024 youtube_ie = YoutubeIE()
1025 metacafe_ie = MetacafeIE(youtube_ie)
1026 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1027 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1030 fd = FileDownloader({
1031 'usenetrc': opts.usenetrc,
1032 'username': opts.username,
1033 'password': opts.password,
1034 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1035 'forceurl': opts.geturl,
1036 'forcetitle': opts.gettitle,
1037 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1038 'format': opts.format,
1039 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1040 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1041 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1042 or u'%(id)s.%(ext)s'),
1043 'ignoreerrors': opts.ignoreerrors,
1044 'ratelimit': opts.ratelimit,
1045 'nooverwrites': opts.nooverwrites,
1047 fd.add_info_extractor(youtube_search_ie)
1048 fd.add_info_extractor(youtube_pl_ie)
1049 fd.add_info_extractor(metacafe_ie)
1050 fd.add_info_extractor(youtube_ie)
1051 retcode = fd.download(all_urls)
1054 except DownloadError:
1056 except SameFileError:
1057 sys.exit(u'ERROR: fixed output name but more than one file to download')
1058 except KeyboardInterrupt:
1059 sys.exit(u'\nERROR: Interrupted by user')