2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# Standard HTTP headers sent with every request; they mimic a Firefox 3
# browser so sites serve their normal pages.
# NOTE(review): the enclosing "std_headers = {" opener and closing brace are
# elided from this excerpt; these are the dictionary's entries.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',

# Characters allowed in "simplified" titles: ASCII letters and digits, as a
# unicode string (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects throw this exception when they are not
    configured to keep going after an error; the exception carries the
    relevant error message.
    """
    pass
class SameFileError(Exception):
    """Error raised when two downloads would collide on disk.

    FileDownloader objects throw this exception when they detect that
    more than one file would be written to the same output filename.
    """
    pass
class PostProcessingError(Exception):
    """Error raised by a PostProcessor.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """
    pass
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit:    Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    """

    # Process exit code accumulated across downloads: 0 until an ignored
    # error sets it to 1 (see trouble()).
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): further attribute initialization (e.g. self.params and
        # the InfoExtractor/PostProcessor lists) happens on lines elided from
        # this excerpt.
        self._download_retcode = 0

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Ancestor paths, shortest first: "a", "a/b", "a/b/c", ...
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):

    def format_bytes(bytes):
        """Format a byte count as a short human-readable string, e.g. "1.23M"."""
        # 1024-based exponent selects the suffix: b, k, M, G, ...
        exponent = long(math.log(float(bytes), 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        """Return download progress as a right-aligned percentage string."""
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        """Estimate remaining download time, formatted as MM:SS."""
        # NOTE(review): 'dif' (elapsed seconds) is computed on a line elided
        # from this excerpt.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        """Return the average download speed as a padded human-readable string."""
        # NOTE(review): 'dif' (now - start) is computed on an elided line.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        """Choose the next read size based on the last block's throughput."""
        # Clamp between half and double the previous block, never above 4 MB.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # An empty suffix indexes 'b' (position 0), i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE gets a reference back to this downloader.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # Trailing comma suppresses print's own newline; one is appended
            # explicitly unless skip_eol is set (used for \r progress lines).
            print u'%s%s' % (message, [u'\n', u''][skip_eol]),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def fixed_template(self):
        """Checks if the output template is fixed (no %(field)s placeholders)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are ignored: just remember a failure for the exit code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        # NOTE(review): 'now' is bound on a line elided from this excerpt.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep exactly long enough for the average speed to drop to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        # \r rewrites the same terminal line; skip_eol keeps the cursor on it.
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several try:/return lines are elided from this excerpt;
        # the except clauses below pair with those elided try blocks.
        # Forced printing happens even when simulating.
        if self.params.get('forcetitle', False):
            print info_dict['title']
        if self.params.get('forceurl', False):
            print info_dict['url']

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):

            # The template is filled in with the info_dict fields.
            filename = self.params['outtmpl'] % info_dict
            self.report_destination(filename)
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr('WARNING: file exists: %s; skipping' % filename)

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

            outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to open for writing: %s' % str(err))

            self._do_download(outstream, info_dict['url'])

        except (OSError, IOError), err:
            self.trouble('ERROR: unable to write video data: %s' % str(err))

        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))

            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A placeholder-free template can only name one file on disk.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and the registered
        # InfoExtractors run on lines elided from this excerpt.
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): 'info' is derived from ie_info on an elided line, and
        # the chain iteration follows on further elided lines.
        info['filepath'] = filename

    def _do_download(self, stream, url):
        """Download 'url' into the open 'stream', reporting progress as it goes."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        # Content-length header arrives as a string (or None when absent).
        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)

            # Progress message for the current state of the transfer.
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            data_block = data.read(block_size)
            data_block_len = len(data_block)
            if data_block_len == 0:
            byte_counter += data_block_len
            stream.write(data_block)
            # Adapt the next read size to the observed throughput.
            block_size = self.best_block_size(after - before, data_block_len)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # data_len is a string; compare against the stringified byte count.
        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): further attribute initialization happens on lines
        # elided from this excerpt.
        self.set_downloader(downloader)

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard around this call exists on elided lines.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1: optional URL prefix; group 2: the video identifier
    # (see _real_extract, which reads mobj.group(2)).
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in .netrc for stored credentials.
    _NETRC_MACHINE = 'youtube'

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (#160) or hex (#xA0).
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                # Prefix with 0 so long() accepts the 0x... form.
                numstr = u'0%s' % numstr
            # NOTE(review): 'base' (10 or 16) is chosen on elided lines.
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

    def _real_initialize(self):
        """Set language, log in and confirm age ahead of extraction.

        All three steps are best-effort except age confirmation, which calls
        trouble() on failure.
        """
        # NOTE(review): several try:/return lines and the login/age form dict
        # openers are elided from this excerpt.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language: failure only warns.
        request = urllib2.Request(self._LANG_URL, None, std_headers)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in: entries of the login form dict (opener elided).
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age: entry of the age form dict (opener elided).
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract and process the information for one YouTube video URL."""
        # NOTE(review): several try:/return lines and the format-code table
        # are elided from this excerpt.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Downloader parameters
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            # Elided dict maps format codes to extensions; default is flv.
            }.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
        self.report_information_extraction(video_id)

        # The "t" value scraped here is required by the get_video endpoint.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        # Uploader nickname, scraped from an inline script variable.
        mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Title, from the page <title> tag.
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        # Decode HTML entities, then make the title filesystem-safe.
        video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title: runs of disallowed characters collapse to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id; group 2: simplified title taken from the URL path
    # (see _real_extract, which reads groups 1 and 2).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # yt-prefixed Metacafe ids are delegated to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and confirm age."""
        # NOTE(review): try:/return lines and the disclaimer form dict opener
        # are elided from this excerpt.
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: entry of the disclaimer form dict (opener elided).
            'submit': "Continue - I'm over 18",
        request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract and process the information for one Metacafe video URL."""
        # NOTE(review): several try:/return lines are elided from this excerpt.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Mirrored video: delegate to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        # The URL is JSON-escaped: drop the backslashes.
        mediaURL = mobj.group(1).replace('\\', '')

        mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        gdaKey = mobj.group(1)

        # The gdaKey acts as an access token appended to the media URL.
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # Strip any markup surrounding the submitter name.
        video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    # A "Next" link in the page means more result pages exist.
    _MORE_PAGES_INDICATOR = r'>Next</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Individual results are handed off to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        # The delegate YouTube IE does the actual authentication work.
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested result count."""
        # NOTE(review): several try:/return lines are elided from this excerpt.
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
            # Bare "ytsearch:" downloads a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                # Cap the request at YouTube's maximum and warn.
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as int fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): loop headers, accumulators (video_ids, already_seen,
        # pagenum) and try:/return lines are elided from this excerpt.
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Pull the id out of href="/watch?v=ID" (drop trailing quote).
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            # No "Next" link: this was the last result page.
            if self._MORE_PAGES_INDICATOR not in page:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: the playlist identifier.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Filled in with (playlist_id, next page) to detect further pages.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Individual playlist entries are handed off to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        # The delegate YouTube IE does the actual authentication work.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids in the playlist, then extract each one."""
        # NOTE(review): loop headers, accumulators (video_ids, ids_in_page,
        # pagenum) and try:/return lines are elided from this excerpt.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers, de-duplicated within the page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when no link to the next page is present.
            if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
            pagenum = pagenum + 1

            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        returned by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information # by default, do nothing
if __name__ == '__main__':
    # NOTE(review): the enclosing try: and the local import lines are elided
    # from this excerpt; the except clauses at the bottom pair with that try.

    # Modules needed only when running the main program

    # General configuration
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
    # Installed second, so the cookie-enabled opener is the one in effect.
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Command-line parser; 'resolve' lets -h/-v be redefined below.
    parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            conflict_handler='resolve',
    parser.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
            action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
            dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
            dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
            dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
            dest='format', metavar='FMT', help='video format code')
    parser.add_option('-m', '--mobile-version',
            action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-d', '--high-def',
            action='store_const', dest='format', help='alias for -f 22', const='22')
    parser.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-a', '--batch-file',
            dest='batchfile', metavar='F', help='file containing URLs to download')
    parser.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    (opts, args) = parser.parse_args()

    # Batch file verification
    if opts.batchfile is not None:
            batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if len(all_urls) < 1:
        sys.exit(u'ERROR: you must provide at least one URL')
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        sys.exit(u'ERROR: account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
    if opts.usetitle and opts.useliteral:
        sys.exit(u'ERROR: using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Prompt rather than require the password on the command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        # Replace the textual limit ("50k") with its numeric value in bytes/sec.
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            sys.exit(u'ERROR: invalid rate limit specified')
        opts.ratelimit = numeric_limit

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)

    # File downloader; outtmpl falls through -o, -t, -l, then the id default.
    charset = locale.getpreferredencoding()
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
        'format': opts.format,
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
    # More specific extractors are registered before the generic YouTube one.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(youtube_ie)
    retcode = fd.download(all_urls)

    except DownloadError:
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')