2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse import parse_qs
27 from cgi import parse_qs
30 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
33 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed to remain in a simplified title (ASCII letters + digits)
simple_title_chars = (string.ascii_letters + string.digits).decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original routed this through a one-shot generator and .next();
	# a plain try/except computes the same value with less machinery.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually works; fall back otherwise
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
54 def htmlentity_transform(matchobj):
55 """Transforms an HTML entity to a Unicode character.
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
60 entity = matchobj.group(1)
62 # Known non-numeric HTML entity
63 if entity in htmlentitydefs.name2codepoint:
64 return unichr(htmlentitydefs.name2codepoint[entity])
67 mobj = re.match(ur'(?u)#(x?\d+)', entity)
69 numstr = mobj.group(1)
70 if numstr.startswith(u'x'):
72 numstr = u'0%s' % numstr
75 return unichr(long(numstr, base))
77 # Unknown entity in name, return its literal representation
78 return (u'&%s;' % entity)
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
85 def sanitize_open(filename, open_mode):
86 """Try to open the given filename, and slightly tweak it if this fails.
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
93 It returns the tuple (stream, definitive_file_name).
97 return (sys.stdout, filename)
98 stream = open(filename, open_mode)
99 return (stream, filename)
100 except (IOError, OSError), err:
101 # In case of error, try to remove win32 forbidden chars
102 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
104 # An exception here should be caught in the caller
105 stream = open(filename, open_mode)
106 return (stream, filename)
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		"""Both arguments are byte counts."""
		# Give the base class a message so str(err) is informative even
		# when a handler does not format the counters itself.
		Exception.__init__(self, 'expected %s bytes and served %s' % (expected, downloaded))
		self.downloaded = downloaded
		self.expected = expected
157 class FileDownloader(object):
158 """File Downloader class.
160 File downloader objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 outtmpl: Template for output names.
193 ignoreerrors: Do not stop on download errors.
194 ratelimit: Download speed limit, in bytes/sec.
195 nooverwrites: Prevent overwriting files.
196 continuedl: Try to continue downloads if possible.
197 noprogress: Do not print the progress bar.
203 _download_retcode = None
204 _num_downloads = None
206 def __init__(self, params):
207 """Create a FileDownloader object with the given options."""
210 self._download_retcode = 0
211 self._num_downloads = 0
215 def pmkdir(filename):
216 """Create directory components in filename. Similar to Unix "mkdir -p"."""
217 components = filename.split(os.sep)
218 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
219 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
220 for dir in aggregate:
221 if not os.path.exists(dir):
225 def format_bytes(bytes):
228 if type(bytes) is str:
233 exponent = long(math.log(bytes, 1024.0))
234 suffix = 'bkMGTPEZY'[exponent]
235 converted = float(bytes) / float(1024**exponent)
236 return '%.2f%s' % (converted, suffix)
239 def calc_percent(byte_counter, data_len):
242 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245 def calc_eta(start, now, total, current):
249 if current == 0 or dif < 0.001: # One millisecond
251 rate = float(current) / dif
252 eta = long((float(total) - float(current)) / rate)
253 (eta_mins, eta_secs) = divmod(eta, 60)
256 return '%02d:%02d' % (eta_mins, eta_secs)
259 def calc_speed(start, now, bytes):
261 if bytes == 0 or dif < 0.001: # One millisecond
262 return '%10s' % '---b/s'
263 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266 def best_block_size(elapsed_time, bytes):
267 new_min = max(bytes / 2.0, 1.0)
268 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
269 if elapsed_time < 0.001:
271 rate = bytes / elapsed_time
279 def parse_bytes(bytestr):
280 """Parse a string indicating a byte quantity into a long integer."""
281 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284 number = float(matchobj.group(1))
285 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
286 return long(round(number * multiplier))
290 """Verify a URL is valid and data could be downloaded. Return real data URL."""
291 request = urllib2.Request(url, None, std_headers)
292 data = urllib2.urlopen(request)
298 def add_info_extractor(self, ie):
299 """Add an InfoExtractor object to the end of the list."""
301 ie.set_downloader(self)
303 def add_post_processor(self, pp):
304 """Add a PostProcessor object to the end of the chain."""
306 pp.set_downloader(self)
308 def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
309 """Print message to stdout if not in quiet mode."""
311 if not self.params.get('quiet', False):
312 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314 except (UnicodeEncodeError), err:
315 if not ignore_encoding_errors:
318 def to_stderr(self, message):
319 """Print message to stderr."""
320 print >>sys.stderr, message.encode(preferredencoding())
322 def fixed_template(self):
323 """Checks if the output template is fixed."""
324 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
326 def trouble(self, message=None):
327 """Determine action to take when a download problem appears.
329 Depending on if the downloader has been configured to ignore
330 download errors or not, this method may throw an exception or
331 not when errors are found, after printing the message.
333 if message is not None:
334 self.to_stderr(message)
335 if not self.params.get('ignoreerrors', False):
336 raise DownloadError(message)
337 self._download_retcode = 1
339 def slow_down(self, start_time, byte_counter):
340 """Sleep if the download speed is over the rate limit."""
341 rate_limit = self.params.get('ratelimit', None)
342 if rate_limit is None or byte_counter == 0:
345 elapsed = now - start_time
348 speed = float(byte_counter) / elapsed
349 if speed > rate_limit:
350 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
352 def report_destination(self, filename):
353 """Report destination filename."""
354 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
356 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
357 """Report download progress."""
358 if self.params.get('noprogress', False):
360 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
361 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367 def report_file_already_downloaded(self, file_name):
368 """Report file has already been fully downloaded."""
370 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
371 except (UnicodeEncodeError), err:
372 self.to_stdout(u'[download] The file has already been downloaded')
374 def report_unable_to_resume(self):
375 """Report it was impossible to resume download."""
376 self.to_stdout(u'[download] Unable to resume')
378 def report_finish(self):
379 """Report download finished."""
380 if self.params.get('noprogress', False):
381 self.to_stdout(u'[download] Download completed')
385 def process_info(self, info_dict):
386 """Process a single dictionary returned by an InfoExtractor."""
387 # Do nothing else if in simulate mode
388 if self.params.get('simulate', False):
389 # Verify URL if it's an HTTP one
390 if info_dict['url'].startswith('http'):
392 self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
393 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
394 raise UnavailableFormatError
397 if self.params.get('forcetitle', False):
398 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
399 if self.params.get('forceurl', False):
400 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
401 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
402 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
403 if self.params.get('forcedescription', False) and 'description' in info_dict:
404 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
409 template_dict = dict(info_dict)
410 template_dict['epoch'] = unicode(long(time.time()))
411 template_dict['ord'] = unicode('%05d' % self._num_downloads)
412 filename = self.params['outtmpl'] % template_dict
413 except (ValueError, KeyError), err:
414 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
415 if self.params.get('nooverwrites', False) and os.path.exists(filename):
416 self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
420 self.pmkdir(filename)
421 except (OSError, IOError), err:
422 self.trouble('ERROR: unable to create directories: %s' % str(err))
426 success = self._do_download(filename, info_dict['url'].encode('utf-8'))
427 except (OSError, IOError), err:
428 raise UnavailableFormatError
429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
430 self.trouble('ERROR: unable to download video data: %s' % str(err))
432 except (ContentTooShortError, ), err:
433 self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
438 self.post_process(filename, info_dict)
439 except (PostProcessingError), err:
440 self.trouble('ERROR: postprocessing: %s' % str(err))
443 def download(self, url_list):
444 """Download a given list of URLs."""
445 if len(url_list) > 1 and self.fixed_template():
446 raise SameFileError(self.params['outtmpl'])
449 suitable_found = False
451 # Go to next InfoExtractor if not suitable
452 if not ie.suitable(url):
455 # Suitable InfoExtractor found
456 suitable_found = True
458 # Extract information from URL and process it
461 # Suitable InfoExtractor had been found; go to next URL
464 if not suitable_found:
465 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
467 return self._download_retcode
469 def post_process(self, filename, ie_info):
470 """Run the postprocessing chain on the given file."""
472 info['filepath'] = filename
478 def _download_with_rtmpdump(self, filename, url):
479 self.report_destination(filename)
481 # Check for rtmpdump first
483 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
484 except (OSError, IOError):
485 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
488 # Download using rtmpdump. rtmpdump returns exit code 2 when
489 # the connection was interrumpted and resuming appears to be
490 # possible. This is part of rtmpdump's normal usage, AFAIK.
491 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
492 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
493 while retval == 2 or retval == 1:
494 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
495 time.sleep(2.0) # This seems to be needed
496 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
498 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
501 self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
504 def _do_download(self, filename, url):
505 # Attempt to download using rtmpdump
506 if url.startswith('rtmp'):
507 return self._download_with_rtmpdump(filename, url)
511 basic_request = urllib2.Request(url, None, std_headers)
512 request = urllib2.Request(url, None, std_headers)
514 # Establish possible resume length
515 if os.path.isfile(filename):
516 resume_len = os.path.getsize(filename)
520 # Request parameters in case of being able to resume
521 if self.params.get('continuedl', False) and resume_len != 0:
522 self.report_resuming_byte(resume_len)
523 request.add_header('Range','bytes=%d-' % resume_len)
526 # Establish connection
528 data = urllib2.urlopen(request)
529 except (urllib2.HTTPError, ), err:
530 if err.code != 416: # 416 is 'Requested range not satisfiable'
533 data = urllib2.urlopen(basic_request)
534 content_length = data.info()['Content-Length']
536 if content_length is not None and long(content_length) == resume_len:
537 # Because the file had already been fully downloaded
538 self.report_file_already_downloaded(filename)
541 # Because the server didn't let us
542 self.report_unable_to_resume()
545 data_len = data.info().get('Content-length', None)
546 data_len_str = self.format_bytes(data_len)
553 data_block = data.read(block_size)
555 data_block_len = len(data_block)
556 if data_block_len == 0:
558 byte_counter += data_block_len
560 # Open file just in time
563 (stream, filename) = sanitize_open(filename, open_mode)
564 self.report_destination(filename)
565 self._num_downloads += 1
566 except (OSError, IOError), err:
567 self.trouble('ERROR: unable to open for writing: %s' % str(err))
570 stream.write(data_block)
571 except (IOError, OSError), err:
572 self.trouble('\nERROR: unable to write data: %s' % str(err))
573 block_size = self.best_block_size(after - before, data_block_len)
576 percent_str = self.calc_percent(byte_counter, data_len)
577 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
578 speed_str = self.calc_speed(start, time.time(), byte_counter)
579 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
582 self.slow_down(start, byte_counter)
585 if data_len is not None and str(byte_counter) != data_len:
586 raise ContentTooShortError(byte_counter, long(data_len))
589 class InfoExtractor(object):
590 """Information Extractor class.
592 Information extractors are the classes that, given a URL, extract
593 information from the video (or videos) the URL refers to. This
594 information includes the real video URL, the video title and simplified
595 title, author and others. The information is stored in a dictionary
596 which is then passed to the FileDownloader. The FileDownloader
597 processes this information possibly downloading the video to the file
598 system, among other possible outcomes. The dictionaries must include
599 the following fields:
601 id: Video identifier.
602 url: Final video URL.
603 uploader: Nickname of the video uploader.
604 title: Literal title.
605 stitle: Simplified title.
606 ext: Video filename extension.
607 format: Video format.
609 The following fields are optional. Their primary purpose is to allow
610 youtube-dl to serve as the backend for a video search function, such
611 as the one in youtube2mp3. They are only used when their respective
612 forced printing functions are called:
614 thumbnail: Full URL to a video thumbnail image.
615 description: One-line video description.
617 Subclasses of this one should re-define the _real_initialize() and
618 _real_extract() methods, as well as the suitable() static method.
619 Probably, they should also be instantiated and added to the main
626 def __init__(self, downloader=None):
627 """Constructor. Receives an optional downloader."""
629 self.set_downloader(downloader)
633 """Receives a URL and returns True if suitable for this IE."""
636 def initialize(self):
637 """Initializes an instance (authentication, etc)."""
639 self._real_initialize()
642 def extract(self, url):
643 """Extracts URL information and returns it in list of dicts."""
645 return self._real_extract(url)
647 def set_downloader(self, downloader):
648 """Sets the downloader for this IE."""
649 self._downloader = downloader
651 def _real_initialize(self):
652 """Real initialization process. Redefine in subclasses."""
655 def _real_extract(self, url):
656 """Real extraction process. Redefine in subclasses."""
659 class YoutubeIE(InfoExtractor):
660 """Information extractor for youtube.com."""
662 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
663 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
664 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
665 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
666 _NETRC_MACHINE = 'youtube'
667 _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
668 _video_extensions = {
678 return (re.match(YoutubeIE._VALID_URL, url) is not None)
680 def report_lang(self):
681 """Report attempt to set language."""
682 self._downloader.to_stdout(u'[youtube] Setting language')
684 def report_login(self):
685 """Report attempt to log in."""
686 self._downloader.to_stdout(u'[youtube] Logging in')
688 def report_age_confirmation(self):
689 """Report attempt to confirm age."""
690 self._downloader.to_stdout(u'[youtube] Confirming age')
692 def report_video_info_webpage_download(self, video_id):
693 """Report attempt to download video info webpage."""
694 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
696 def report_information_extraction(self, video_id):
697 """Report attempt to extract video information."""
698 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
704 def report_rtmp_download(self):
705 """Indicate the download will use the RTMP protocol."""
706 self._downloader.to_stdout(u'[youtube] RTMP download detected')
708 def _real_initialize(self):
709 if self._downloader is None:
714 downloader_params = self._downloader.params
716 # Attempt to use provided username and password or .netrc data
717 if downloader_params.get('username', None) is not None:
718 username = downloader_params['username']
719 password = downloader_params['password']
720 elif downloader_params.get('usenetrc', False):
722 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
727 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
728 except (IOError, netrc.NetrcParseError), err:
729 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
733 request = urllib2.Request(self._LANG_URL, None, std_headers)
736 urllib2.urlopen(request).read()
737 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
738 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
741 # No authentication to be performed
747 'current_form': 'loginForm',
749 'action_login': 'Log In',
750 'username': username,
751 'password': password,
753 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
756 login_results = urllib2.urlopen(request).read()
757 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
758 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
760 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
761 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
767 'action_confirm': 'Confirm',
769 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
771 self.report_age_confirmation()
772 age_results = urllib2.urlopen(request).read()
773 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
774 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
777 def _real_extract(self, url):
778 # Extract video id from URL
779 mobj = re.match(self._VALID_URL, url)
781 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
783 video_id = mobj.group(2)
785 # Downloader parameters
790 if self._downloader is not None:
791 params = self._downloader.params
792 format_param = params.get('format', None)
793 if format_param == '0':
794 format_param = self._available_formats[quality_index]
796 elif format_param == '-1':
797 format_param = self._available_formats[quality_index]
802 video_extension = self._video_extensions.get(format_param, 'flv')
805 self.report_video_info_webpage_download(video_id)
806 for el_type in ['embedded', 'detailpage', 'vevo']:
807 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
808 % (video_id, el_type))
809 request = urllib2.Request(video_info_url, None, std_headers)
811 video_info_webpage = urllib2.urlopen(request).read()
812 video_info = parse_qs(video_info_webpage)
813 if 'token' in video_info:
815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
816 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
818 self.report_information_extraction(video_id)
821 if 'token' not in video_info:
822 # Attempt to see if YouTube has issued an error message
823 if 'reason' not in video_info:
824 self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
825 stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
826 stream.write(video_info_webpage)
829 reason = urllib.unquote_plus(video_info['reason'][0])
830 self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
832 token = urllib.unquote_plus(video_info['token'][0])
833 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
834 if format_param is not None:
835 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
837 # Check possible RTMP download
838 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
839 self.report_rtmp_download()
840 video_real_url = video_info['conn'][0]
843 if 'author' not in video_info:
844 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
846 video_uploader = urllib.unquote_plus(video_info['author'][0])
849 if 'title' not in video_info:
850 self._downloader.trouble(u'ERROR: unable to extract video title')
852 video_title = urllib.unquote_plus(video_info['title'][0])
853 video_title = video_title.decode('utf-8')
854 video_title = sanitize_title(video_title)
857 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
858 simple_title = simple_title.strip(ur'_')
861 if 'thumbnail_url' not in video_info:
862 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
864 else: # don't panic if we can't find it
865 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
867 # get video description
868 video_description = 'No description available.' # we need something to pass to self._downloader
869 # this requires an additional HTTP request and a little
870 # more time, so don't do it unless absolutely necessary
871 if self._downloader.params.get('forcedescription', False):
872 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
873 request = urllib2.Request(video_page_url, None, std_headers)
875 video_page_webpage = urllib2.urlopen(request).read()
876 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
878 video_description = mobj.group(1)
879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
880 pass # don't panic if we can't find it
883 # Process video information
884 self._downloader.process_info({
885 'id': video_id.decode('utf-8'),
886 'url': video_real_url.decode('utf-8'),
887 'uploader': video_uploader.decode('utf-8'),
888 'title': video_title,
889 'stitle': simple_title,
890 'ext': video_extension.decode('utf-8'),
891 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
892 'thumbnail': video_thumbnail.decode('utf-8'),
893 'description': video_description.decode('utf-8'),
897 if quality_index == len(self._available_formats):
902 format_param = self._available_formats[quality_index]
906 except UnavailableFormatError, err:
907 if best_quality or all_formats:
908 if quality_index == len(self._available_formats):
909 # I don't ever expect this to happen
911 self._downloader.trouble(u'ERROR: no known formats available for video')
914 self.report_unavailable_format(video_id, format_param)
916 format_param = self._available_formats[quality_index]
919 self._downloader.trouble('ERROR: format not available for video')
923 class MetacafeIE(InfoExtractor):
924 """Information Extractor for metacafe.com."""
926 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
927 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
928 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
931 def __init__(self, youtube_ie, downloader=None):
932 InfoExtractor.__init__(self, downloader)
933 self._youtube_ie = youtube_ie
937 return (re.match(MetacafeIE._VALID_URL, url) is not None)
939 def report_disclaimer(self):
940 """Report disclaimer retrieval."""
941 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
943 def report_age_confirmation(self):
944 """Report attempt to confirm age."""
945 self._downloader.to_stdout(u'[metacafe] Confirming age')
947 def report_download_webpage(self, video_id):
948 """Report webpage download."""
949 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
951 def report_extraction(self, video_id):
952 """Report information extraction."""
953 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
955 def _real_initialize(self):
956 # Retrieve disclaimer
957 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
959 self.report_disclaimer()
960 disclaimer = urllib2.urlopen(request).read()
961 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
962 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
968 'submit': "Continue - I'm over 18",
970 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
972 self.report_age_confirmation()
973 disclaimer = urllib2.urlopen(request).read()
974 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
975 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
978 def _real_extract(self, url):
979 # Extract id and simplified title from URL
980 mobj = re.match(self._VALID_URL, url)
982 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
985 video_id = mobj.group(1)
987 # Check if video comes from YouTube
988 mobj2 = re.match(r'^yt-(.*)$', video_id)
989 if mobj2 is not None:
990 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
993 simple_title = mobj.group(2).decode('utf-8')
994 video_extension = 'flv'
996 # Retrieve video webpage to extract further information
997 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
999 self.report_download_webpage(video_id)
1000 webpage = urllib2.urlopen(request).read()
1001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1005 # Extract URL, uploader and title from webpage
1006 self.report_extraction(video_id)
1007 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1009 self._downloader.trouble(u'ERROR: unable to extract media URL')
1011 mediaURL = urllib.unquote(mobj.group(1))
1013 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1015 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1017 #gdaKey = mobj.group(1)
1019 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1021 video_url = mediaURL
1023 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1025 self._downloader.trouble(u'ERROR: unable to extract title')
1027 video_title = mobj.group(1).decode('utf-8')
1028 video_title = sanitize_title(video_title)
1030 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1032 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1034 video_uploader = mobj.group(1)
1037 # Process video information
1038 self._downloader.process_info({
1039 'id': video_id.decode('utf-8'),
1040 'url': video_url.decode('utf-8'),
1041 'uploader': video_uploader.decode('utf-8'),
1042 'title': video_title,
1043 'stitle': simple_title,
1044 'ext': video_extension.decode('utf-8'),
1047 except UnavailableFormatError:
1048 self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (the `suitable()` def line, `try:` headers, `if mobj is None:`
    # guards and their `return`s); the statements below keep their original
    # order but are not complete on their own.

    # group(1) captures the numeric docid of the video.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the elided `suitable(url)` predicate: URL must match _VALID_URL.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt — presumably a bare `pass`; confirm)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Default container; switched to 'flv' below when no mp4 download_url
        # is present on the page.
        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # No mp4 download link found: fall back to the escaped flv URL.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # The page embeds '=' and '&' as literal \x3d / \x26 escape sequences;
        # turn them back into the real characters.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Collapse every run of non-filename-safe characters into '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on a search-results page, so run a
            # site-restricted search for this docid to fetch it.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (the `suitable()` def line, `try:` headers, `if mobj is None:`
    # guards and their `return`s); statements below keep their original order.

    # Only direct links whose "current" query parameter names a .flv file are
    # supported; group(1) is that filename and doubles as the video id.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the elided `suitable(url)` predicate.
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt — presumably a bare `pass`; confirm)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The real media URL lives in the "file" parameter of the
        # <link rel="video_src"> element.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # <title> has the form "TITLE video by UPLOADER - Photobucket";
        # group(1) is the title, group(2) the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Collapse every run of non-filename-safe characters into '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (the `suitable()` def line, `try:` headers, `if mobj is None:`
    # guards and their `return`s); statements below keep their original order.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the elided `suitable(url)` predicate.
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt — presumably a bare `pass`; confirm)

    def _real_extract(self, url):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # group(2) is the "vid"; the video id used for reporting.
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            # Pull the canonical id/vid pair out of the page's JS, then
            # recurse once on the rewritten /watch/ URL.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        # Collapse every run of non-filename-safe characters into '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # NOTE(review): group(1) here is the (people|profile) alternation,
        # not the uploader name in group(2) — looks like a latent bug; confirm
        # against a live page before changing.
        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # The playlist XML escapes the URL with HTML entities; undo them.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): 'thumbnail' and 'description' appear TWICE in this
            # dict literal — the later (un-decoded) pair silently wins. The
            # earlier decoded pair should probably be kept and the duplicates
            # below removed.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (`suitable()`, `try:` headers, `if mobj is None:` guards and
    # their `return`s); statements below keep their original order.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: this extractor only runs when nothing else matched.
        self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # (body elided in this excerpt — presumably a bare `pass`; confirm)

    def _real_extract(self, url):
        # Provisional id: last path component; replaced once the media URL is known.
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Collapse every run of non-filename-safe characters into '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): error message says "title" but this failure is about
        # the uploader/domain — message looks copy-pasted; confirm.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (`suitable()`, `try:` headers, `if mobj is None:` guards,
    # `return`s, and parts of the prefix-dispatch `if` chain); statements
    # below keep their original order.

    # Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual extraction is delegated to the plain YoutubeIE instance.
        self._youtube_ie = youtube_ie

    # Body of the elided `suitable(url)` predicate.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "ytsearchN" prefix from the search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp oversized requests to the site's practical maximum.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Fragile slicing: the match is href="/watch?v=ID"; split on '='
            # and drop the trailing quote to recover ID.
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link: last results page, extract what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (see YoutubeSearchIE above — the three search IEs are
    # structurally parallel); statements below keep their original order.

    # Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual extraction is delegated to the GoogleIE instance.
        self._google_ie = google_ie

    # Body of the elided `suitable(url)` predicate.
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "gvsearchN" prefix from the search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp oversized requests to the site's practical maximum.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Cleaner than YoutubeSearchIE: the docid is a capture group.
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No "Next" link: last results page, extract what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (see YoutubeSearchIE above — the three search IEs are
    # structurally parallel); statements below keep their original order.

    # Queries look like "yvsearch:foo", "yvsearchN:foo" or "yvsearchall:foo".
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual extraction is delegated to the YahooIE instance.
        self._yahoo_ie = yahoo_ie

    # Body of the elided `suitable(url)` predicate.
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "yvsearchN" prefix from the search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp oversized requests to the site's practical maximum.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # group(1) is "PAGEID/VID", the same pair YahooIE's URL expects.
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No "Next" link: last results page, extract what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (`suitable()`, `try:` headers, `if mobj is None:` guards,
    # loop headers and `break`s); statements below keep their original order.

    # Matches view_play_list/my_playlists pages and user/.../user/ pages;
    # group(1) is the playlist id.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the plain YoutubeIE instance.
        self._youtube_ie = youtube_ie

    # Body of the elided `suitable(url)` predicate.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)

        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # ids_in_page de-duplicates within a single page; video_ids
        # accumulates across pages.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" link means this was the last playlist page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # All pages collected: hand each video to the YouTube extractor.
        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): this excerpt elides several structural lines of the
    # original (`suitable()`, `try:` headers, `if mobj is None:` guards and
    # `return`s); statements below keep their original order.

    # group(1) is the username; fetched via the GData API feed below.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the plain YoutubeIE instance.
        self._youtube_ie = youtube_ie

    # Body of the elided `suitable(url)` predicate.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): the greedy `(.*)` in _VIDEO_INDICATOR (marked "XXX Fix
        # this" by the author) likely over-captures past the video id; confirm
        # against a real GData feed.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Base class for post-processing steps.

    Instances are attached to a downloader through its
    add_post_processor() method.  After every successful download the
    downloader walks its chain of PostProcessors, invoking run() on each
    one: the first call receives the initial information dictionary, and
    each later call receives whatever the previous processor returned.
    The chain stops as soon as a processor returns None, or when the last
    processor has run.

    PostProcessor objects take part in the same "mutual registration"
    scheme with the downloader that InfoExtractor objects use.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the given downloader to this post processor."""
        self._downloader = downloader

    def run(self, information):
        """Run this post-processing step.

        The "information" argument is an InfoExtractor-style dictionary
        carrying one extra key, "filepath", which names the downloaded
        file on disk.

        Returning None halts the post-processing chain; returning a
        dictionary (possibly this one with some fields changed) passes it
        on to the next processor.  Implementations may also raise
        PostProcessingError, which the downloader knows how to handle.

        This default implementation simply passes the dictionary through.
        """
        return information
1919 ### MAIN PROGRAM ###
1920 if __name__ == '__main__':
1922 # Modules needed only when running the main program
1926 # Function to update the program file with the latest version from bitbucket.org
1927 def update_self(downloader, filename):
1928 # Note: downloader only used for options
1929 if not os.access (filename, os.W_OK):
1930 sys.exit('ERROR: no write permissions on %s' % filename)
1932 downloader.to_stdout('Updating to latest stable version...')
1933 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1934 latest_version = urllib.urlopen(latest_url).read().strip()
1935 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1936 newcontent = urllib.urlopen(prog_url).read()
1937 stream = open(filename, 'w')
1938 stream.write(newcontent)
1940 downloader.to_stdout('Updated to version %s' % latest_version)
1942 # General configuration
1943 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1944 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1945 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1947 # Parse command line
1948 parser = optparse.OptionParser(
1949 usage='Usage: %prog [options] url...',
1950 version='2010.04.04',
1951 conflict_handler='resolve',
1954 parser.add_option('-h', '--help',
1955 action='help', help='print this help text and exit')
1956 parser.add_option('-v', '--version',
1957 action='version', help='print program version and exit')
1958 parser.add_option('-U', '--update',
1959 action='store_true', dest='update_self', help='update this program to latest stable version')
1960 parser.add_option('-i', '--ignore-errors',
1961 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1962 parser.add_option('-r', '--rate-limit',
1963 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1965 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1966 authentication.add_option('-u', '--username',
1967 dest='username', metavar='UN', help='account username')
1968 authentication.add_option('-p', '--password',
1969 dest='password', metavar='PW', help='account password')
1970 authentication.add_option('-n', '--netrc',
1971 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1972 parser.add_option_group(authentication)
1974 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1975 video_format.add_option('-f', '--format',
1976 action='store', dest='format', metavar='FMT', help='video format code')
1977 video_format.add_option('-b', '--best-quality',
1978 action='store_const', dest='format', help='download the best quality video possible', const='0')
1979 video_format.add_option('-m', '--mobile-version',
1980 action='store_const', dest='format', help='alias for -f 17', const='17')
1981 video_format.add_option('-d', '--high-def',
1982 action='store_const', dest='format', help='alias for -f 22', const='22')
1983 video_format.add_option('--all-formats',
1984 action='store_const', dest='format', help='download all available video formats', const='-1')
1985 parser.add_option_group(video_format)
# Verbosity / simulation options.  Every switch in this group is a plain
# boolean flag defaulting to off, so they are declared from a table.
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
for _flags, _dest, _help in (
	(('-q', '--quiet'), 'quiet', 'activates quiet mode'),
	(('-s', '--simulate'), 'simulate', 'do not download video'),
	(('-g', '--get-url'), 'geturl', 'simulate, quiet but print URL'),
	(('-e', '--get-title'), 'gettitle', 'simulate, quiet but print title'),
	(('--get-thumbnail',), 'getthumbnail', 'simulate, quiet but print thumbnail URL'),
	(('--get-description',), 'getdescription', 'simulate, quiet but print video description'),
	(('--no-progress',), 'noprogress', 'do not print progress bar'),
	):
	verbosity.add_option(action='store_true', dest=_dest, help=_help,
		default=False, *_flags)
parser.add_option_group(verbosity)
# Filesystem options: how downloaded files are named and whether existing
# or partially downloaded files are respected.
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title', dest='usetitle', default=False,
	action='store_true', help='use title in file name')
filesystem.add_option('-l', '--literal', dest='useliteral', default=False,
	action='store_true', help='use literal title in file name')
filesystem.add_option('-o', '--output', dest='outtmpl',
	metavar='TPL', help='output filename template')
filesystem.add_option('-a', '--batch-file', dest='batchfile',
	metavar='F', help='file containing URLs to download')
filesystem.add_option('-w', '--no-overwrites', dest='nooverwrites',
	default=False, action='store_true', help='do not overwrite files')
filesystem.add_option('-c', '--continue', dest='continue_dl', default=False,
	action='store_true', help='resume partially downloaded files')
parser.add_option_group(filesystem)
(opts, args) = parser.parse_args()

# Batch file verification.  A batch file contributes one URL per line;
# blank lines are skipped and surrounding whitespace is stripped.  As
# shown, the read had no error handling and 'batchurls' was left
# undefined when no batch file was given; both are fixed here.
batchurls = []
if opts.batchfile is not None:
	try:
		batchurls = open(opts.batchfile, 'r').readlines()
		batchurls = [x.strip() for x in batchurls]
		batchurls = [x for x in batchurls if len(x) > 0]
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then the positional command-line URLs.
all_urls = batchurls + args
# Conflicting, missing and erroneous options.  parser.error() prints the
# message and terminates, so each check below is a hard stop.  The order
# of the checks determines which error is reported first.
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.username is None and opts.password is not None:
	parser.error(u'account username missing')
if (opts.useliteral or opts.usetitle) and opts.outtmpl is not None:
	parser.error(u'using output template conflicts with using title or literal title')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
# A username without a password is allowed: prompt for it interactively.
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	parsed_rate = FileDownloader.parse_bytes(opts.ratelimit)
	if parsed_rate is None:
		parser.error(u'invalid rate limit specified')
	# Replace the textual limit (e.g. '50k') with its numeric value.
	opts.ratelimit = parsed_rate
# Information extractors.  The plain YouTube extractor is constructed
# first because the playlist, user and search extractors delegate the
# individual videos they find to it; the Google and Yahoo search
# extractors delegate to their site extractors the same way.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
generic_ie = GenericIE()
# Select the output filename template.  An explicit -o template always
# wins; otherwise the default depends on --all-formats (which adds the
# format code so multiple downloads don't collide) and on -t/-l
# (title-based names).  This replaces the original 'and/or' chain, which
# silently fell through to the defaults on an empty -o value.
if opts.outtmpl is not None:
	# Python 2: command-line byte string decoded to unicode.
	outtmpl = opts.outtmpl.decode(preferredencoding())
elif opts.format == '-1' and opts.usetitle:
	outtmpl = u'%(stitle)s-%(id)s-%(format)s.%(ext)s'
elif opts.format == '-1' and opts.useliteral:
	outtmpl = u'%(title)s-%(id)s-%(format)s.%(ext)s'
elif opts.format == '-1':
	outtmpl = u'%(id)s-%(format)s.%(ext)s'
elif opts.usetitle:
	outtmpl = u'%(stitle)s-%(id)s.%(ext)s'
elif opts.useliteral:
	outtmpl = u'%(title)s-%(id)s.%(ext)s'
else:
	outtmpl = u'%(id)s.%(ext)s'

# File downloader.  'quiet' and 'simulate' are forced on by any of the
# --get-* switches, which print a single datum instead of downloading.
# (The closing '})' was missing in the excerpt and is restored here.)
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'format': opts.format,
	'outtmpl': outtmpl,
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	})
# Register the information extractors.  Order matters: the more specific
# matchers (searches, playlists, users) come before the plain site
# extractors, and GenericIE is registered last since it is the fallback
# when none of the others recognize a URL.
for _extractor in (
	youtube_search_ie,
	youtube_pl_ie,
	youtube_user_ie,
	metacafe_ie,
	youtube_ie,
	google_ie,
	google_search_ie,
	photobucket_ie,
	yahoo_ie,
	yahoo_search_ie,
	generic_ie,
	):
	fd.add_info_extractor(_extractor)
# Self-update requested via --update-self (the option itself is defined
# outside this excerpt); replaces the running script with a new version.
2103 if opts.update_self:
2104 update_self(fd, sys.argv[0])
# With no URLs at all there is nothing to do; an update-only invocation
# is allowed to proceed without URLs.
2107 if len(all_urls) < 1:
2108 if not opts.update_self:
2109 parser.error(u'you must provide at least one URL')
# Download every collected URL (batch file + command line) in order.
2112 retcode = fd.download(all_urls)
# NOTE(review): these handlers close a try: opened before this excerpt;
# the DownloadError handler's body is not visible here.
2115 except DownloadError:
2117 except SameFileError:
2118 sys.exit(u'ERROR: fixed output name but more than one file to download')
# Ctrl-C exits with a message instead of a traceback.
2119 except KeyboardInterrupt:
2120 sys.exit(u'\nERROR: Interrupted by user')