2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
22 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25 'Accept-Language': 'en-us,en;q=0.5',
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
30 class DownloadError(Exception):
31 """Download Error exception.
33 This exception may be thrown by FileDownloader objects if they are not
34 configured to continue on errors. They will contain the appropriate
39 class SameFileError(Exception):
40 """Same File exception.
42 This exception will be thrown by FileDownloader objects if they detect
43 multiple files would have to be downloaded to the same file on disk.
47 class PostProcessingError(Exception):
48 """Post Processing exception.
50 This exception may be raised by PostProcessor's .run() method to
51 indicate an error in the postprocessing task.
55 class FileDownloader(object):
56 """File Downloader class.
58 File downloader objects are the ones responsible of downloading the
59 actual video file and writing it to disk if the user has requested
60 it, among some other tasks. In most cases there should be one per
61 program. As, given a video URL, the downloader doesn't know how to
62 extract all the needed information, task that InfoExtractors do, it
63 has to pass the URL to one of them.
65 For this, file downloader objects have a method that allows
66 InfoExtractors to be registered in a given order. When it is passed
67 a URL, the file downloader handles it to the first InfoExtractor it
68 finds that reports being able to handle it. The InfoExtractor returns
69 all the information to the FileDownloader and the latter downloads the
70 file or does whatever it's instructed to do.
72 File downloaders accept a lot of parameters. In order not to saturate
73 the object constructor with arguments, it receives a dictionary of
74 options instead. These options are available through the get_params()
75 method for the InfoExtractors to use. The FileDownloader also registers
76 itself as the downloader in charge for the InfoExtractors that are
77 added to it, so this is a "mutual registration".
81 username: Username for authentication purposes.
82 password: Password for authentication purposes.
83 usenetrc: Use netrc for authentication instead.
84 quiet: Do not print messages to stdout.
85 forceurl: Force printing final URL.
86 forcetitle: Force printing title.
87 simulate: Do not download the video files.
88 format: Video format code.
89 outtmpl: Template for output names.
90 ignoreerrors: Do not stop on download errors.
91 ratelimit: Download speed limit, in bytes/sec.
92 nooverwrites: Prevent overwriting files.
99 def __init__(self, params):
100 """Create a FileDownloader object with the given options."""
103 self.set_params(params)
106 def pmkdir(filename):
107 """Create directory components in filename. Similar to Unix "mkdir -p"."""
108 components = filename.split(os.sep)
109 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
110 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
111 for dir in aggregate:
112 if not os.path.exists(dir):
116 def format_bytes(bytes):
122 exponent = long(math.log(float(bytes), 1024.0))
123 suffix = 'bkMGTPEZY'[exponent]
124 converted = float(bytes) / float(1024**exponent)
125 return '%.2f%s' % (converted, suffix)
128 def calc_percent(byte_counter, data_len):
131 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
134 def calc_eta(start, now, total, current):
138 if current == 0 or dif < 0.001: # One millisecond
140 rate = float(current) / dif
141 eta = long((float(total) - float(current)) / rate)
142 (eta_mins, eta_secs) = divmod(eta, 60)
145 return '%02d:%02d' % (eta_mins, eta_secs)
148 def calc_speed(start, now, bytes):
150 if bytes == 0 or dif < 0.001: # One millisecond
151 return '%10s' % '---b/s'
152 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
155 def best_block_size(elapsed_time, bytes):
156 new_min = max(bytes / 2.0, 1.0)
157 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
158 if elapsed_time < 0.001:
160 rate = bytes / elapsed_time
168 def parse_bytes(bytestr):
169 """Parse a string indicating a byte quantity into a long integer."""
170 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
173 number = float(matchobj.group(1))
174 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
175 return long(round(number * multiplier))
def set_params(self, params):
    """Replace the downloader's option dictionary.

    params: dictionary of downloader options. It is stored by reference,
    not copied. Subclasses of dict are accepted as well.

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() instead of an exact type comparison, so that dict
    # subclasses (e.g. collections.OrderedDict) are not rejected.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
183 def get_params(self):
184 """Get parameters."""
187 def add_info_extractor(self, ie):
188 """Add an InfoExtractor object to the end of the list."""
190 ie.set_downloader(self)
192 def add_post_processor(self, pp):
193 """Add a PostProcessor object to the end of the chain."""
195 pp.set_downloader(self)
197 def to_stdout(self, message, skip_eol=False):
198 """Print message to stdout if not in quiet mode."""
199 if not self._params.get('quiet', False):
200 print u'%s%s' % (message, [u'\n', u''][skip_eol]),
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error.

    No quiet-mode check is made here: the message is always printed.
    """
    print >>sys.stderr, message
def fixed_template(self):
    """Tell whether the output template always yields the same name.

    Returns True when the 'outtmpl' parameter contains no "%(name)s"
    substitution field, and False as soon as one is present.
    """
    placeholder = re.search(r'(?u)%\(.+?\)s', self._params['outtmpl'])
    return placeholder is None
211 def trouble(self, message=None):
212 """Determine action to take when a download problem appears.
214 Depending on if the downloader has been configured to ignore
215 download errors or not, this method may throw an exception or
216 not when errors are found, after printing the message. If it
217 doesn't raise, it returns an error code suitable to be returned
218 later as a program exit code to indicate error.
220 if message is not None:
221 self.to_stderr(message)
222 if not self._params.get('ignoreerrors', False):
223 raise DownloadError(message)
226 def slow_down(self, start_time, byte_counter):
227 """Sleep if the download speed is over the rate limit."""
228 rate_limit = self._params.get('ratelimit', None)
229 if rate_limit is None or byte_counter == 0:
232 elapsed = now - start_time
235 speed = float(byte_counter) / elapsed
236 if speed > rate_limit:
237 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Announce on stdout the file name the download is written to."""
    notice = u'[download] Destination: %s' % filename
    self.to_stdout(notice)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Print one progress line built from pre-formatted strings.

    The text starts with a carriage return and is sent with
    skip_eol=True, so no line terminator is requested from to_stdout().
    """
    fields = (percent_str, data_len_str, speed_str, eta_str)
    self.to_stdout(u'\r[download] %s of %s at %s ETA %s' % fields, skip_eol=True)
248 def report_finish(self):
249 """Report download finished."""
252 def download(self, url_list):
253 """Download a given list of URLs."""
255 if len(url_list) > 1 and self.fixed_template():
256 raise SameFileError(self._params['outtmpl'])
259 suitable_found = False
261 if not ie.suitable(url):
263 # Suitable InfoExtractor found
264 suitable_found = True
265 all_results = ie.extract(url)
266 results = [x for x in all_results if x is not None]
267 if len(results) != len(all_results):
268 retcode = self.trouble()
270 if len(results) > 1 and self.fixed_template():
271 raise SameFileError(self._params['outtmpl'])
273 for result in results:
275 if self._params.get('forcetitle', False):
276 print result['title']
277 if self._params.get('forceurl', False):
280 # Do nothing else if in simulate mode
281 if self._params.get('simulate', False):
285 filename = self._params['outtmpl'] % result
286 self.report_destination(filename)
287 except (ValueError, KeyError), err:
288 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
290 if self._params['nooverwrites'] and os.path.exists(filename):
291 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
294 self.pmkdir(filename)
295 except (OSError, IOError), err:
296 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
299 outstream = open(filename, 'wb')
300 except (OSError, IOError), err:
301 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
304 self._do_download(outstream, result['url'])
306 except (OSError, IOError), err:
307 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
309 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
310 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
313 self.post_process(filename, result)
314 except (PostProcessingError), err:
315 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
319 if not suitable_found:
320 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
324 def post_process(self, filename, ie_info):
325 """Run the postprocessing chain on the given file."""
327 info['filepath'] = filename
333 def _do_download(self, stream, url):
334 request = urllib2.Request(url, None, std_headers)
335 data = urllib2.urlopen(request)
336 data_len = data.info().get('Content-length', None)
337 data_len_str = self.format_bytes(data_len)
343 percent_str = self.calc_percent(byte_counter, data_len)
344 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
345 speed_str = self.calc_speed(start, time.time(), byte_counter)
346 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
350 data_block = data.read(block_size)
352 data_block_len = len(data_block)
353 if data_block_len == 0:
355 byte_counter += data_block_len
356 stream.write(data_block)
357 block_size = self.best_block_size(after - before, data_block_len)
360 self.slow_down(start, byte_counter)
363 if data_len is not None and str(byte_counter) != data_len:
364 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
366 class InfoExtractor(object):
367 """Information Extractor class.
369 Information extractors are the classes that, given a URL, extract
370 information from the video (or videos) the URL refers to. This
371 information includes the real video URL, the video title and simplified
372 title, author and others. It is returned in a list of dictionaries when
373 calling its extract() method. It is a list because a URL can refer to
374 more than one video (think of playlists). The dictionaries must include
375 the following fields:
377 id: Video identifier.
378 url: Final video URL.
379 uploader: Nickname of the video uploader.
380 title: Literal title.
381 stitle: Simplified title.
382 ext: Video filename extension.
384 Subclasses of this one should re-define the _real_initialize() and
385 _real_extract() methods, as well as the suitable() static method.
386 Probably, they should also be instantiated and added to the main
393 def __init__(self, downloader=None):
394 """Constructor. Receives an optional downloader."""
396 self.set_downloader(downloader)
400 """Receives a URL and returns True if suitable for this IE."""
403 def initialize(self):
404 """Initializes an instance (authentication, etc)."""
406 self._real_initialize()
409 def extract(self, url):
410 """Extracts URL information and returns it in list of dicts."""
412 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the given downloader to this IE (None detaches it)."""
    # The reference is kept as-is; other methods read self._downloader.
    self._downloader = downloader
418 def to_stdout(self, message):
419 """Print message to stdout if downloader is not in quiet mode."""
420 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error.

    The downloader's quiet setting is not consulted here.
    """
    print >>sys.stderr, message
427 def _real_initialize(self):
428 """Real initialization process. Redefine in subclasses."""
431 def _real_extract(self, url):
432 """Real extraction process. Redefine in subclasses."""
435 class YoutubeIE(InfoExtractor):
436 """Information extractor for youtube.com."""
438 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
439 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
440 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
441 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
442 _NETRC_MACHINE = 'youtube'
446 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Announce that the site language is about to be set."""
    notice = u'[youtube] Setting language'
    self.to_stdout(notice)
def report_login(self):
    """Announce that a login attempt is about to be made."""
    notice = u'[youtube] Logging in'
    self.to_stdout(notice)
def report_age_confirmation(self):
    """Announce that the age confirmation is about to be submitted."""
    notice = u'[youtube] Confirming age'
    self.to_stdout(notice)
def report_webpage_download(self, video_id):
    """Announce that the watch page of video_id is being downloaded."""
    notice = u'[youtube] %s: Downloading video webpage' % video_id
    self.to_stdout(notice)
def report_information_extraction(self, video_id):
    """Announce that information about video_id is being extracted."""
    notice = u'[youtube] %s: Extracting video information' % video_id
    self.to_stdout(notice)
def report_video_url(self, video_id, video_real_url):
    """Print the final media URL that was extracted for video_id."""
    details = (video_id, video_real_url)
    self.to_stdout(u'[youtube] %s: URL: %s' % details)
472 def _real_initialize(self):
473 if self._downloader is None:
478 downloader_params = self._downloader.get_params()
480 # Attempt to use provided username and password or .netrc data
481 if downloader_params.get('username', None) is not None:
482 username = downloader_params['username']
483 password = downloader_params['password']
484 elif downloader_params.get('usenetrc', False):
486 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
491 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
492 except (IOError, netrc.NetrcParseError), err:
493 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
496 # No authentication to be performed
501 request = urllib2.Request(self._LOGIN_URL, None, std_headers)
504 urllib2.urlopen(request).read()
505 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
506 self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
511 'current_form': 'loginForm',
513 'action_login': 'Log In',
514 'username': username,
515 'password': password,
517 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
520 login_results = urllib2.urlopen(request).read()
521 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
522 self.to_stderr(u'WARNING: unable to log in: bad username or password')
524 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
525 self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
531 'action_confirm': 'Confirm',
533 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
535 self.report_age_confirmation()
536 age_results = urllib2.urlopen(request).read()
537 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
538 self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
541 def _real_extract(self, url):
542 # Extract video id from URL
543 mobj = re.match(self._VALID_URL, url)
545 self.to_stderr(u'ERROR: invalid URL: %s' % url)
547 video_id = mobj.group(2)
549 # Downloader parameters
551 if self._downloader is not None:
552 params = self._downloader.get_params()
553 format_param = params.get('format', None)
556 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
558 # Normalize URL, including format
559 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
560 if format_param is not None:
561 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
562 request = urllib2.Request(normalized_url, None, std_headers)
564 self.report_webpage_download(video_id)
565 video_webpage = urllib2.urlopen(request).read()
566 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
567 self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
569 self.report_information_extraction(video_id)
572 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
574 self.to_stderr(u'ERROR: unable to extract "t" parameter')
576 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
577 if format_param is not None:
578 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
579 self.report_video_url(video_id, video_real_url)
582 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
584 self.to_stderr(u'ERROR: unable to extract uploader nickname')
586 video_uploader = mobj.group(1)
589 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
591 self.to_stderr(u'ERROR: unable to extract video title')
593 video_title = mobj.group(1).decode('utf-8')
594 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
595 video_title = video_title.replace(os.sep, u'%')
598 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
599 simple_title = simple_title.strip(ur'_')
603 'id': video_id.decode('utf-8'),
604 'url': video_real_url.decode('utf-8'),
605 'uploader': video_uploader.decode('utf-8'),
606 'title': video_title,
607 'stitle': simple_title,
608 'ext': video_extension.decode('utf-8'),
611 class MetacafeIE(InfoExtractor):
612 """Information Extractor for metacafe.com."""
614 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
615 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
def __init__(self, youtube_ie, downloader=None):
    """Create a Metacafe extractor.

    youtube_ie: extractor whose extract() is used for videos that
    Metacafe merely relays from YouTube ("yt-" identifiers).
    downloader: optional downloader, handed to InfoExtractor.__init__.
    """
    InfoExtractor.__init__(self, downloader)
    # Kept for delegation from _real_extract().
    self._youtube_ie = youtube_ie
624 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce that the disclaimer page is being retrieved."""
    notice = u'[metacafe] Retrieving disclaimer'
    self.to_stdout(notice)
def report_age_confirmation(self):
    """Announce that the age confirmation is about to be submitted."""
    notice = u'[metacafe] Confirming age'
    self.to_stdout(notice)
def report_download_webpage(self, video_id):
    """Announce that the web page of video_id is being downloaded."""
    notice = u'[metacafe] %s: Downloading webpage' % video_id
    self.to_stdout(notice)
def report_extraction(self, video_id):
    """Announce that information about video_id is being extracted."""
    notice = u'[metacafe] %s: Extracting information' % video_id
    self.to_stdout(notice)
642 def _real_initialize(self):
643 # Retrieve disclaimer
644 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
646 self.report_disclaimer()
647 disclaimer = urllib2.urlopen(request).read()
648 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
649 self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
655 'submit': "Continue - I'm over 18",
657 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
659 self.report_age_confirmation()
660 disclaimer = urllib2.urlopen(request).read()
661 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
662 self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
665 def _real_extract(self, url):
666 # Extract id and simplified title from URL
667 mobj = re.match(self._VALID_URL, url)
669 self.to_stderr(u'ERROR: invalid URL: %s' % url)
672 video_id = mobj.group(1)
674 # Check if video comes from YouTube
675 mobj2 = re.match(r'^yt-(.*)$', video_id)
676 if mobj2 is not None:
677 return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
679 simple_title = mobj.group(2).decode('utf-8')
680 video_extension = 'flv'
682 # Retrieve video webpage to extract further information
683 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
685 self.report_download_webpage(video_id)
686 webpage = urllib2.urlopen(request).read()
687 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
688 self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
691 # Extract URL, uploader and title from webpage
692 self.report_extraction(video_id)
693 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
695 self.to_stderr(u'ERROR: unable to extract media URL')
697 mediaURL = mobj.group(1).replace('\\', '')
699 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
701 self.to_stderr(u'ERROR: unable to extract gdaKey')
703 gdaKey = mobj.group(1)
705 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
707 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
709 self.to_stderr(u'ERROR: unable to extract title')
711 video_title = mobj.group(1).decode('utf-8')
713 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
715 self.to_stderr(u'ERROR: unable to extract uploader nickname')
717 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
721 'id': video_id.decode('utf-8'),
722 'url': video_url.decode('utf-8'),
723 'uploader': video_uploader.decode('utf-8'),
724 'title': video_title,
725 'stitle': simple_title,
726 'ext': video_extension.decode('utf-8'),
730 class YoutubeSearchIE(InfoExtractor):
731 """Information Extractor for YouTube search queries."""
732 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
733 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
734 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
735 _MORE_PAGES_INDICATOR = r'>Next</a>'
def __init__(self, youtube_ie, downloader=None):
    """Create a YouTube search extractor.

    youtube_ie: extractor used to initialize the session and to extract
    every video found by a search.
    downloader: optional downloader, handed to InfoExtractor.__init__.
    """
    InfoExtractor.__init__(self, downloader)
    # Search results are resolved through this extractor.
    self._youtube_ie = youtube_ie
744 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce that results page pagenum of a search query is being downloaded."""
    details = (query, pagenum)
    self.to_stdout(u'[youtube] query "%s": Downloading page %s' % details)
def _real_initialize(self):
    """Initialize by delegating to the wrapped YouTube extractor."""
    youtube_extractor = self._youtube_ie
    youtube_extractor.initialize()
753 def _real_extract(self, query):
754 mobj = re.match(self._VALID_QUERY, query)
756 self.to_stderr(u'ERROR: invalid search query "%s"' % query)
759 prefix, query = query.split(':')
762 return self._download_n_results(query, 1)
763 elif prefix == 'all':
764 return self._download_n_results(query, -1)
769 self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
771 return self._download_n_results(query, n)
772 except ValueError: # parsing prefix as int fails
773 return self._download_n_results(query, 1)
775 def _download_n_results(self, query, n):
776 """Downloads a specified number of results for a query"""
783 self.report_download_page(query, pagenum)
784 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
785 request = urllib2.Request(result_url, None, std_headers)
787 page = urllib2.urlopen(request).read()
788 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
789 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
792 # Extract video identifiers
793 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
794 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
795 if video_id not in already_seen:
796 video_ids.append(video_id)
797 already_seen.add(video_id)
798 if len(video_ids) == n:
799 # Specified n videos reached
802 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
805 if self._MORE_PAGES_INDICATOR not in page:
808 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
811 pagenum = pagenum + 1
813 class YoutubePlaylistIE(InfoExtractor):
814 """Information Extractor for YouTube playlists."""
816 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
817 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
818 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
819 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
def __init__(self, youtube_ie, downloader=None):
    """Create a YouTube playlist extractor.

    youtube_ie: extractor used to initialize the session and to extract
    every video listed in a playlist.
    downloader: optional downloader, handed to InfoExtractor.__init__.
    """
    InfoExtractor.__init__(self, downloader)
    # Playlist entries are resolved through this extractor.
    self._youtube_ie = youtube_ie
828 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
def report_download_page(self, playlist_id, pagenum):
    """Announce that page pagenum of playlist playlist_id is being downloaded."""
    details = (playlist_id, pagenum)
    self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % details)
def _real_initialize(self):
    """Initialize by delegating to the wrapped YouTube extractor."""
    youtube_extractor = self._youtube_ie
    youtube_extractor.initialize()
837 def _real_extract(self, url):
838 # Extract playlist id
839 mobj = re.match(self._VALID_URL, url)
841 self.to_stderr(u'ERROR: invalid url: %s' % url)
844 # Download playlist pages
845 playlist_id = mobj.group(1)
850 self.report_download_page(playlist_id, pagenum)
851 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
853 page = urllib2.urlopen(request).read()
854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
855 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
858 # Extract video identifiers
860 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
861 if mobj.group(1) not in ids_in_page:
862 ids_in_page.append(mobj.group(1))
863 video_ids.extend(ids_in_page)
865 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
867 pagenum = pagenum + 1
871 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
874 class PostProcessor(object):
875 """Post Processor class.
877 PostProcessor objects can be added to downloaders with their
878 add_post_processor() method. When the downloader has finished a
879 successful download, it will take its internal chain of PostProcessors
880 and start calling the run() method on each one of them, first with
881 an initial argument and then with the returned value of the previous
884 The chain will be stopped if one of them ever returns None or the end
885 of the chain is reached.
887 PostProcessor objects follow a "mutual registration" process similar
888 to InfoExtractor objects.
893 def __init__(self, downloader=None):
894 self._downloader = downloader
896 def to_stdout(self, message):
897 """Print message to stdout if downloader is not in quiet mode."""
898 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error.

    The downloader's quiet setting is not consulted here.
    """
    print >>sys.stderr, message
def set_downloader(self, downloader):
    """Attach the given downloader to this PP (None detaches it)."""
    # The reference is kept as-is; other methods read self._downloader.
    self._downloader = downloader
909 def run(self, information):
910 """Run the PostProcessor.
912 The "information" argument is a dictionary like the ones
913 returned by InfoExtractors. The only difference is that this
914 one has an extra field called "filepath" that points to the
917 When this method returns None, the postprocessing chain is
918 stopped. However, this method may return an information
919 dictionary that will be passed to the next postprocessing
920 object in the chain. It can be the one it received after
921 changing some fields.
923 In addition, this method may raise a PostProcessingError
924 exception that will be taken into account by the downloader
927 return information # by default, do nothing
930 if __name__ == '__main__':
932 # Modules needed only when running the main program
936 # General configuration
937 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
938 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
939 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
942 parser = optparse.OptionParser(
943 usage='Usage: %prog [options] url...',
944 version='2009.02.07',
945 conflict_handler='resolve',
947 parser.add_option('-h', '--help',
948 action='help', help='print this help text and exit')
949 parser.add_option('-v', '--version',
950 action='version', help='print program version and exit')
951 parser.add_option('-u', '--username',
952 dest='username', metavar='UN', help='account username')
953 parser.add_option('-p', '--password',
954 dest='password', metavar='PW', help='account password')
955 parser.add_option('-o', '--output',
956 dest='outtmpl', metavar='TPL', help='output filename template')
957 parser.add_option('-q', '--quiet',
958 action='store_true', dest='quiet', help='activates quiet mode', default=False)
959 parser.add_option('-s', '--simulate',
960 action='store_true', dest='simulate', help='do not download video', default=False)
961 parser.add_option('-t', '--title',
962 action='store_true', dest='usetitle', help='use title in file name', default=False)
963 parser.add_option('-l', '--literal',
964 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
965 parser.add_option('-n', '--netrc',
966 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
967 parser.add_option('-g', '--get-url',
968 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
969 parser.add_option('-e', '--get-title',
970 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
971 parser.add_option('-f', '--format',
972 dest='format', metavar='FMT', help='video format code')
973 parser.add_option('-b', '--best-quality',
974 action='store_const', dest='format', help='alias for -f 18', const='18')
975 parser.add_option('-m', '--mobile-version',
976 action='store_const', dest='format', help='alias for -f 17', const='17')
977 parser.add_option('-i', '--ignore-errors',
978 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
979 parser.add_option('-r', '--rate-limit',
980 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
981 parser.add_option('-a', '--batch-file',
982 dest='batchfile', metavar='F', help='file containing URLs to download')
983 parser.add_option('-w', '--no-overwrites',
984 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
985 (opts, args) = parser.parse_args()
987 # Batch file verification
989 if opts.batchfile is not None:
991 batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
993 sys.exit(u'ERROR: batch file could not be read')
994 all_urls = batchurls + args
996 # Conflicting, missing and erroneous options
997 if len(all_urls) < 1:
998 sys.exit(u'ERROR: you must provide at least one URL')
999 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1000 sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
1001 if opts.password is not None and opts.username is None:
1002 sys.exit(u'ERROR: account username missing')
1003 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1004 sys.exit(u'ERROR: using output template conflicts with using title or literal title')
1005 if opts.usetitle and opts.useliteral:
1006 sys.exit(u'ERROR: using title conflicts with using literal title')
1007 if opts.username is not None and opts.password is None:
1008 opts.password = getpass.getpass(u'Type account password and press return:')
1009 if opts.ratelimit is not None:
1010 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1011 if numeric_limit is None:
1012 sys.exit(u'ERROR: invalid rate limit specified')
1013 opts.ratelimit = numeric_limit
1015 # Information extractors
1016 youtube_ie = YoutubeIE()
1017 metacafe_ie = MetacafeIE(youtube_ie)
1018 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1019 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1022 charset = locale.getdefaultlocale()[1]
1025 fd = FileDownloader({
1026 'usenetrc': opts.usenetrc,
1027 'username': opts.username,
1028 'password': opts.password,
1029 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1030 'forceurl': opts.geturl,
1031 'forcetitle': opts.gettitle,
1032 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1033 'format': opts.format,
1034 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
1035 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1036 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1037 or u'%(id)s.%(ext)s'),
1038 'ignoreerrors': opts.ignoreerrors,
1039 'ratelimit': opts.ratelimit,
1040 'nooverwrites': opts.nooverwrites,
1042 fd.add_info_extractor(youtube_search_ie)
1043 fd.add_info_extractor(youtube_pl_ie)
1044 fd.add_info_extractor(metacafe_ie)
1045 fd.add_info_extractor(youtube_ie)
1046 retcode = fd.download(all_urls)
1049 except DownloadError:
1051 except SameFileError:
1052 sys.exit(u'ERROR: fixed output name but more than one file to download')
1053 except KeyboardInterrupt:
1054 sys.exit(u'\nERROR: Interrupted by user')