2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
30 # parse_qs was moved from the cgi module to the urlparse module recently.
32 from urlparse import parse_qs
34 from cgi import parse_qs
37 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
38 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
39 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 'Accept-Encoding': 'gzip, deflate',
41 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in "simplified" titles: ASCII letters and digits,
# decoded to unicode so they can be embedded in (?u) regex character classes.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(review): this excerpt appears truncated -- the generator is
        # expected to yield `pref` (and presumably handle a failing locale
        # lookup with a fallback); confirm against the full source.
        pref = locale.getpreferredencoding()
    # .next() is the Python 2 generator protocol: take the first yielded value.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x27).
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the 'if mobj is not None:' guard and the assignment of
    # `base` (16 for hex, 10 otherwise, presumably) are missing from this
    # excerpt; confirm against the full source.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with 0 so '0x..' parses under a hexadecimal base.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
88 def sanitize_title(utitle):
89 """Sanitizes a video title so it could be used as part of a filename."""
90 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
91 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function would.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the try: line and the 'filename == u"-"' stdout branch
    # header appear to be missing from this excerpt; confirm against the
    # full source.
    if sys.platform == 'win32':
        # Put stdout in binary mode so downloaded data is not mangled by
        # Windows newline translation when streaming to '-'.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # NOTE(review): class-level defaults for downloaded/expected may exist in
    # the full source but are not visible in this excerpt.

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): the deflate() helper's def/try/except lines are missing
    # from this excerpt; the two returns below presumably try raw-deflate
    # first and fall back to zlib-wrapped data -- confirm against the full
    # source.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def http_request(self, req):
        # Add every standard header; then honor the internal
        # no-compression marker and strip it before the request is sent.
        for h in std_headers:
            # NOTE(review): lines removing a pre-existing header of the same
            # name are missing here; 'return req' is also not visible.
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # NOTE(review): the assignment of old_resp (the original response
        # object) is missing from this excerpt; 'return resp' is also not
        # visible.
        # gzip-encoded body: wrap it in a GzipFile so reads decompress.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-encoded body: decompress eagerly into a StringIO.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options (keys of the params dictionary):

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    """

    # NOTE(review): this excerpt is missing many original lines (decorators
    # such as @staticmethod, try:/else: lines, early returns, some
    # statements). Inline NOTE(review) comments flag the visible gaps.

    # Exit status returned by download(); trouble() sets it to 1.
    _download_retcode = None
    # Counter backing the %(autonumber)s output-template field.
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): the extractor/postprocessor list initializations and
        # the self.params assignment are not visible in this excerpt.
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen messages to stderr when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Progressively longer prefixes of the path: a, a/b, a/b/c, ...
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the directory-creation call is missing here.

    def format_bytes(bytes):
        # Render a byte count with a 1024-based suffix, e.g. '1.23M'.
        if type(bytes) is str:
            # NOTE(review): the string-to-number conversion branch and the
            # zero-bytes special case are missing from this excerpt.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Fixed-width percentage string for the progress line.
        # NOTE(review): the data_len-is-None guard is missing here.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate remaining time as 'MM:SS'.
        # NOTE(review): the computation of dif (now - start, presumably) and
        # the early returns for unknown ETA are missing from this excerpt.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Average download speed as a fixed-width string.
        # NOTE(review): the computation of dif (now - start, presumably) is
        # missing from this excerpt.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size to how fast the previous block arrived.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # NOTE(review): the fast-path return and the clamping of the
            # computed rate between new_min and new_max are missing from
            # this excerpt.
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the matchobj-is-None guard is missing here.
        number = float(matchobj.group(1))
        # 'b' (no suffix) maps to index 0, i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the list append is missing here. Mutual registration:
        # the IE gets a reference back to this downloader.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the list append is missing here.
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the try: line around the print is missing here.
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-raise for non-ignored errors is
                # missing from this excerpt.

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # NOTE(review): the early return is missing here.
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence: set the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(field)s substitutions)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): the early return and the 'now = time.time()'
            # assignment are missing from this excerpt.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to fall back under the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # Download straight to the final name when .part files are disabled,
        # when streaming to stdout ('-'), or when the target exists but is
        # not a regular file.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            # NOTE(review): 'return filename' is missing here.
        return filename + u'.part'

    def try_rename(self, old_filename, new_filename):
        # Move the temporary file onto its final name, reporting failures.
        # NOTE(review): the try: line around os.rename is missing here.
        if old_filename == new_filename:
            # NOTE(review): the early return is missing here.
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early return is missing here.
        # \r rewrites the progress line in place; skip_eol avoids a newline.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the try: line is missing here.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a filename-free message when the name cannot be
            # encoded for the terminal.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # NOTE(review): the else branch (presumably a newline after the
        # progress bar) is missing from this excerpt.

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings for use as a backend (see InfoExtractor docs).
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(review): the return ending simulate mode is missing here.

        # Build the output filename from the template.
        # NOTE(review): the try: line is missing here.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            # NOTE(review): early return missing here.
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(review): early return missing here.

        # NOTE(review): the try: line is missing here.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            # NOTE(review): early return missing here.

        # NOTE(review): the try: line and the success check are missing here.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            # NOTE(review): early return missing here.
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # NOTE(review): the try: line is missing here.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only produce one filename; refuse several URLs.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and the registered
        # InfoExtractors are missing from this excerpt; `url` and `ie`
        # below are their loop variables.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # NOTE(review): 'continue' missing here.

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it

        # Suitable InfoExtractor had been found; go to next URL

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into `info` and the loop over the
        # registered postprocessors are missing from this excerpt.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url, player_url):
        # Download an rtmp:// URL by shelling out to the rtmpdump tool.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the try: line is missing here.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # NOTE(review): early return missing here.

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): 'break' missing here (no further progress).
        # NOTE(review): the 'retval == 0' success check is missing; the two
        # lines below are its body, the trouble() call is the failure path.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        # Download `url` into `filename`, resuming and retrying as configured.
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            # NOTE(review): early return missing here.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request is kept Range-free for the HTTP 416 fallback below.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(review): the else branch (resume_len = 0, presumably) and the
        # open_mode initialization are missing from this excerpt.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            # NOTE(review): the switch of open_mode to append is missing here.

        retries = self.params.get('retries', 0)
        # NOTE(review): the initialization of `count` is missing here.
        while count <= retries:
            # Establish connection
            # NOTE(review): the try:/break around urlopen is missing here.
            data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                # NOTE(review): 'raise' missing here.
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                # NOTE(review): the try: line is missing here.
                # Open the connection again without the range header
                data = urllib2.urlopen(basic_request)
                content_length = data.info()['Content-Length']
            except (urllib2.HTTPError, ), err:
                if err.code < 500 or err.code >= 600:
                    # NOTE(review): 'raise' missing here.
            # Examine the reported length
            if (content_length is not None and
                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                # The file had already been fully downloaded.
                # Explanation to the above condition: in issue #175 it was revealed that
                # YouTube sometimes adds or removes a few bytes from the end of the file,
                # changing the file size slightly and causing problems for some users. So
                # I decided to implement a suggested change and consider the file
                # completely downloaded if the file size differs less than 100 bytes from
                # the one in the hard drive.
                self.report_file_already_downloaded(filename)
                self.try_rename(tmpfilename, filename)
                # NOTE(review): 'return True' and the else branch opening are
                # missing from this excerpt.
                # The length does not match, we start the download over
                self.report_unable_to_resume()
            # NOTE(review): the retry-count increment and its guard are
            # missing here.
            self.report_retry(count, retries)

        # Ran out of retries without a successful connection.
        self.trouble(u'ERROR: giving up after %s retries' % retries)
        # NOTE(review): 'return False' missing here.

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Account for the bytes we already have on disk.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # NOTE(review): block_size initialization, the start timestamp, the
        # stream placeholder and the 'while True:' read-loop header are
        # missing from this excerpt; the following lines are the loop body.
        data_block = data.read(block_size)
        if len(data_block) == 0:
            # NOTE(review): 'break' missing here (end of stream).
        byte_counter += len(data_block)

        # Open file just in time
        # NOTE(review): the 'if stream is None:' / try: lines are missing.
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
            # NOTE(review): 'return False' missing here.

        # NOTE(review): the try: line and the before/after timestamps around
        # the write are missing from this excerpt.
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            # NOTE(review): 'return False' missing here.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress message for this block.
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit after each block.
        self.slow_down(start, byte_counter - resume_len)

        # NOTE(review): stream close and report_finish() are missing here.
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): 'return True' missing here.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the initialization-state flag (used by initialize(),
        # presumably) is not visible in this excerpt.
        self.set_downloader(downloader)

    # NOTE(review): the 'suitable(url)' method definition line (and its
    # @staticmethod decorator, presumably) is missing from this excerpt;
    # only the docstring is visible.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the run-once guard around this call is not visible.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the initialize() call before extraction is not
        # visible in this excerpt.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): several original lines (try:/else: lines, early returns,
    # dict entries) are missing from this excerpt; inline NOTE(review)
    # comments flag the visible gaps.

    # Group 1 captures the scheme/host/path prefix; group 2 is the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): most format->extension mappings and the closing brace
        # are missing from this excerpt.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

    # NOTE(review): the 'suitable(url)' method definition line is missing;
    # the return below is its body.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        # Set language, optionally log in, and confirm age.
        if self._downloader is None:
            # NOTE(review): the early return is missing here.

        # NOTE(review): the username/password defaulting lines are missing.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the try: line and the branch unpacking the
            # (login, account, password) triple from info are missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(review): the early return is missing here.

        # Set language (forces English pages for the regexes below).
        request = urllib2.Request(self._LANG_URL)
        # NOTE(review): the try:/report_lang() lines are missing here.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(review): the early return is missing here.

        # No authentication to be performed
        # NOTE(review): the username-is-None guard and the opening of the
        # login_form dict are missing from this excerpt.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # NOTE(review): the try:/report_login() lines are missing here.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            # The login form came back: credentials were rejected.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # NOTE(review): the early return is missing here.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(review): the early return is missing here.

        # Confirm age
        # NOTE(review): the opening of the age_form dict is missing here.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # NOTE(review): the try: line is missing here.
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # NOTE(review): the early return is missing here.

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the mobj-is-None guard and its return are missing.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (used for SWF URL, date, description).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # NOTE(review): the try: line is missing here.
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # NOTE(review): the early return is missing here.

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the mobj-is-not-None guard and its else branch
        # (player_url = None, presumably) are missing here.
        # Un-escape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one has a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # NOTE(review): the try: line is missing here.
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): 'break' missing here.
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # NOTE(review): the early return is missing here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(review): the else: line and the returns for both branches
            # are missing from this excerpt.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # NOTE(review): the early return is missing here.
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # NOTE(review): the early return is missing here.
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse anything outside simple_title_chars.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail.
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): the fallback assignment of video_thumbnail is
            # missing from this excerpt.
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, scraped from the watch page.
        # NOTE(review): the 'upload_date = None' default is missing here.
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before trying known formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y']
            for expression in format_expressions:
                # NOTE(review): the try:/except around strptime is missing.
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (only scraped when forcedescription is set).
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # Token.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is 'fmt|url,fmt|url,...'.
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(review): the else: line is missing here.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(review): the early return is missing here.
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the else: line is missing here.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    # NOTE(review): the early return is missing here.
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        # NOTE(review): the else: line and its return are missing here.
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension for the chosen format, defaulting to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters
            # NOTE(review): the try: line is missing here.
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            # NOTE(review): the closing '})' of the dict literal is missing
            # from this excerpt.
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1080 class MetacafeIE(InfoExtractor):
1081 """Information Extractor for metacafe.com."""
1083 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1084 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1085 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Keep a YouTube extractor around for yt-prefixed Metacafe IDs."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
1094 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    screen = self._downloader.to_screen
    screen(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
    """Announce that the age-confirmation form is being submitted."""
    screen = self._downloader.to_screen
    screen(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1112 def _real_initialize(self):
1113 # Retrieve disclaimer
1114 request = urllib2.Request(self._DISCLAIMER)
1116 self.report_disclaimer()
1117 disclaimer = urllib2.urlopen(request).read()
1118 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1119 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1125 'submit': "Continue - I'm over 18",
1127 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1129 self.report_age_confirmation()
1130 disclaimer = urllib2.urlopen(request).read()
1131 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1132 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1135 def _real_extract(self, url):
1136 # Extract id and simplified title from URL
1137 mobj = re.match(self._VALID_URL, url)
1139 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1142 video_id = mobj.group(1)
1144 # Check if video comes from YouTube
1145 mobj2 = re.match(r'^yt-(.*)$', video_id)
1146 if mobj2 is not None:
1147 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1150 # At this point we have a new video
1151 self._downloader.increment_downloads()
1153 simple_title = mobj.group(2).decode('utf-8')
1155 # Retrieve video webpage to extract further information
1156 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1158 self.report_download_webpage(video_id)
1159 webpage = urllib2.urlopen(request).read()
1160 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1161 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1164 # Extract URL, uploader and title from webpage
1165 self.report_extraction(video_id)
1166 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1167 if mobj is not None:
1168 mediaURL = urllib.unquote(mobj.group(1))
1169 video_extension = mediaURL[-3:]
1171 # Extract gdaKey if available
1172 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1174 video_url = mediaURL
1176 gdaKey = mobj.group(1)
1177 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1179 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1181 self._downloader.trouble(u'ERROR: unable to extract media URL')
1183 vardict = parse_qs(mobj.group(1))
1184 if 'mediaData' not in vardict:
1185 self._downloader.trouble(u'ERROR: unable to extract media URL')
1187 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1189 self._downloader.trouble(u'ERROR: unable to extract media URL')
1191 mediaURL = mobj.group(1).replace('\\/', '/')
1192 video_extension = mediaURL[-3:]
1193 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1195 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1197 self._downloader.trouble(u'ERROR: unable to extract title')
1199 video_title = mobj.group(1).decode('utf-8')
1200 video_title = sanitize_title(video_title)
1202 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1204 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1206 video_uploader = mobj.group(1)
1209 # Process video information
1210 self._downloader.process_info({
1211 'id': video_id.decode('utf-8'),
1212 'url': video_url.decode('utf-8'),
1213 'uploader': video_uploader.decode('utf-8'),
1214 'upload_date': u'NA',
1215 'title': video_title,
1216 'stitle': simple_title,
1217 'ext': video_extension.decode('utf-8'),
1221 except UnavailableVideoError:
1222 self._downloader.trouble(u'\nERROR: unable to download video')
1225 class DailymotionIE(InfoExtractor):
1226 """Information Extractor for Dailymotion"""
1228 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
def __init__(self, downloader=None):
    """Delegate all initialization to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1235 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1245 def _real_initialize(self):
1248 def _real_extract(self, url):
1249 # Extract id and simplified title from URL
1250 mobj = re.match(self._VALID_URL, url)
1252 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1255 # At this point we have a new video
1256 self._downloader.increment_downloads()
1257 video_id = mobj.group(1)
1259 simple_title = mobj.group(2).decode('utf-8')
1260 video_extension = 'flv'
1262 # Retrieve video webpage to extract further information
1263 request = urllib2.Request(url)
1265 self.report_download_webpage(video_id)
1266 webpage = urllib2.urlopen(request).read()
1267 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1268 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1271 # Extract URL, uploader and title from webpage
1272 self.report_extraction(video_id)
1273 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1275 self._downloader.trouble(u'ERROR: unable to extract media URL')
1277 mediaURL = urllib.unquote(mobj.group(1))
1279 # if needed add http://www.dailymotion.com/ if relative URL
1281 video_url = mediaURL
1283 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1284 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1286 self._downloader.trouble(u'ERROR: unable to extract title')
1288 video_title = mobj.group(1).decode('utf-8')
1289 video_title = sanitize_title(video_title)
1291 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1293 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1295 video_uploader = mobj.group(1)
1298 # Process video information
1299 self._downloader.process_info({
1300 'id': video_id.decode('utf-8'),
1301 'url': video_url.decode('utf-8'),
1302 'uploader': video_uploader.decode('utf-8'),
1303 'upload_date': u'NA',
1304 'title': video_title,
1305 'stitle': simple_title,
1306 'ext': video_extension.decode('utf-8'),
1310 except UnavailableVideoError:
1311 self._downloader.trouble(u'\nERROR: unable to download video')
1313 class GoogleIE(InfoExtractor):
1314 """Information extractor for video.google.com."""
1316 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    """Delegate all initialization to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1323 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1333 def _real_initialize(self):
1336 def _real_extract(self, url):
1337 # Extract id from URL
1338 mobj = re.match(self._VALID_URL, url)
1340 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1343 # At this point we have a new video
1344 self._downloader.increment_downloads()
1345 video_id = mobj.group(1)
1347 video_extension = 'mp4'
1349 # Retrieve video webpage to extract further information
1350 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1352 self.report_download_webpage(video_id)
1353 webpage = urllib2.urlopen(request).read()
1354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1355 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1358 # Extract URL, uploader, and title from webpage
1359 self.report_extraction(video_id)
1360 mobj = re.search(r"download_url:'([^']+)'", webpage)
1362 video_extension = 'flv'
1363 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1365 self._downloader.trouble(u'ERROR: unable to extract media URL')
1367 mediaURL = urllib.unquote(mobj.group(1))
1368 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1369 mediaURL = mediaURL.replace('\\x26', '\x26')
1371 video_url = mediaURL
1373 mobj = re.search(r'<title>(.*)</title>', webpage)
1375 self._downloader.trouble(u'ERROR: unable to extract title')
1377 video_title = mobj.group(1).decode('utf-8')
1378 video_title = sanitize_title(video_title)
1379 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1381 # Extract video description
1382 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1384 self._downloader.trouble(u'ERROR: unable to extract video description')
1386 video_description = mobj.group(1).decode('utf-8')
1387 if not video_description:
1388 video_description = 'No description available.'
1390 # Extract video thumbnail
1391 if self._downloader.params.get('forcethumbnail', False):
1392 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1394 webpage = urllib2.urlopen(request).read()
1395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1396 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1398 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1400 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1402 video_thumbnail = mobj.group(1)
1403 else: # we need something to pass to process_info
1404 video_thumbnail = ''
1408 # Process video information
1409 self._downloader.process_info({
1410 'id': video_id.decode('utf-8'),
1411 'url': video_url.decode('utf-8'),
1413 'upload_date': u'NA',
1414 'title': video_title,
1415 'stitle': simple_title,
1416 'ext': video_extension.decode('utf-8'),
1420 except UnavailableVideoError:
1421 self._downloader.trouble(u'\nERROR: unable to download video')
1424 class PhotobucketIE(InfoExtractor):
1425 """Information extractor for photobucket.com."""
1427 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
    """Delegate all initialization to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1434 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1444 def _real_initialize(self):
1447 def _real_extract(self, url):
1448 # Extract id from URL
1449 mobj = re.match(self._VALID_URL, url)
1451 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1454 # At this point we have a new video
1455 self._downloader.increment_downloads()
1456 video_id = mobj.group(1)
1458 video_extension = 'flv'
1460 # Retrieve video webpage to extract further information
1461 request = urllib2.Request(url)
1463 self.report_download_webpage(video_id)
1464 webpage = urllib2.urlopen(request).read()
1465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1466 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1469 # Extract URL, uploader, and title from webpage
1470 self.report_extraction(video_id)
1471 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1473 self._downloader.trouble(u'ERROR: unable to extract media URL')
1475 mediaURL = urllib.unquote(mobj.group(1))
1477 video_url = mediaURL
1479 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1481 self._downloader.trouble(u'ERROR: unable to extract title')
1483 video_title = mobj.group(1).decode('utf-8')
1484 video_title = sanitize_title(video_title)
1485 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1487 video_uploader = mobj.group(2).decode('utf-8')
1490 # Process video information
1491 self._downloader.process_info({
1492 'id': video_id.decode('utf-8'),
1493 'url': video_url.decode('utf-8'),
1494 'uploader': video_uploader,
1495 'upload_date': u'NA',
1496 'title': video_title,
1497 'stitle': simple_title,
1498 'ext': video_extension.decode('utf-8'),
1502 except UnavailableVideoError:
1503 self._downloader.trouble(u'\nERROR: unable to download video')
1506 class YahooIE(InfoExtractor):
1507 """Information extractor for video.yahoo.com."""
1509 # _VALID_URL matches all Yahoo! Video URLs
1510 # _VPAGE_URL matches only the extractable '/watch/' URLs
1511 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1512 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
    """Delegate all initialization to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1519 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce the webpage download for the given video id."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1529 def _real_initialize(self):
1532 def _real_extract(self, url, new_video=True):
1533 # Extract ID from URL
1534 mobj = re.match(self._VALID_URL, url)
1536 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1539 # At this point we have a new video
1540 self._downloader.increment_downloads()
1541 video_id = mobj.group(2)
1542 video_extension = 'flv'
1544 # Rewrite valid but non-extractable URLs as
1545 # extractable English language /watch/ URLs
1546 if re.match(self._VPAGE_URL, url) is None:
1547 request = urllib2.Request(url)
1549 webpage = urllib2.urlopen(request).read()
1550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1554 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1556 self._downloader.trouble(u'ERROR: Unable to extract id field')
1558 yahoo_id = mobj.group(1)
1560 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1562 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1564 yahoo_vid = mobj.group(1)
1566 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1567 return self._real_extract(url, new_video=False)
1569 # Retrieve video webpage to extract further information
1570 request = urllib2.Request(url)
1572 self.report_download_webpage(video_id)
1573 webpage = urllib2.urlopen(request).read()
1574 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1575 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1578 # Extract uploader and title from webpage
1579 self.report_extraction(video_id)
1580 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1582 self._downloader.trouble(u'ERROR: unable to extract video title')
1584 video_title = mobj.group(1).decode('utf-8')
1585 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1587 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1589 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1591 video_uploader = mobj.group(1).decode('utf-8')
1593 # Extract video thumbnail
1594 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1596 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1598 video_thumbnail = mobj.group(1).decode('utf-8')
1600 # Extract video description
1601 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1603 self._downloader.trouble(u'ERROR: unable to extract video description')
1605 video_description = mobj.group(1).decode('utf-8')
1606 if not video_description: video_description = 'No description available.'
1608 # Extract video height and width
1609 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1611 self._downloader.trouble(u'ERROR: unable to extract video height')
1613 yv_video_height = mobj.group(1)
1615 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1617 self._downloader.trouble(u'ERROR: unable to extract video width')
1619 yv_video_width = mobj.group(1)
1621 # Retrieve video playlist to extract media URL
1622 # I'm not completely sure what all these options are, but we
1623 # seem to need most of them, otherwise the server sends a 401.
1624 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1625 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1626 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1627 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1628 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1630 self.report_download_webpage(video_id)
1631 webpage = urllib2.urlopen(request).read()
1632 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1636 # Extract media URL from playlist XML
1637 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1639 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1641 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1642 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1645 # Process video information
1646 self._downloader.process_info({
1647 'id': video_id.decode('utf-8'),
1649 'uploader': video_uploader,
1650 'upload_date': u'NA',
1651 'title': video_title,
1652 'stitle': simple_title,
1653 'ext': video_extension.decode('utf-8'),
1654 'thumbnail': video_thumbnail.decode('utf-8'),
1655 'description': video_description,
1656 'thumbnail': video_thumbnail,
1657 'description': video_description,
1660 except UnavailableVideoError:
1661 self._downloader.trouble(u'\nERROR: unable to download video')
1664 class GenericIE(InfoExtractor):
1665 """Generic last-resort information extractor."""
def __init__(self, downloader=None):
    """Delegate all initialization to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic fallback is in use, then announce the download."""
    to_screen = self._downloader.to_screen
    to_screen(u'WARNING: Falling back on generic information extractor.')
    to_screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce that metadata extraction has started for the given video id."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1683 def _real_initialize(self):
1686 def _real_extract(self, url):
1687 # At this point we have a new video
1688 self._downloader.increment_downloads()
1690 video_id = url.split('/')[-1]
1691 request = urllib2.Request(url)
1693 self.report_download_webpage(video_id)
1694 webpage = urllib2.urlopen(request).read()
1695 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1696 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1698 except ValueError, err:
1699 # since this is the last-resort InfoExtractor, if
1700 # this error is thrown, it'll be thrown here
1701 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1704 self.report_extraction(video_id)
1705 # Start with something easy: JW Player in SWFObject
1706 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1708 # Broaden the search a little bit
1709 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1711 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1714 # It's possible that one of the regexes
1715 # matched, but returned an empty group:
1716 if mobj.group(1) is None:
1717 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1720 video_url = urllib.unquote(mobj.group(1))
1721 video_id = os.path.basename(video_url)
1723 # here's a fun little line of code for you:
1724 video_extension = os.path.splitext(video_id)[1][1:]
1725 video_id = os.path.splitext(video_id)[0]
1727 # it's tempting to parse this further, but you would
1728 # have to take into account all the variations like
1729 # Video Title - Site Name
1730 # Site Name | Video Title
1731 # Video Title - Tagline | Site Name
1732 # and so on and so forth; it's just not practical
1733 mobj = re.search(r'<title>(.*)</title>', webpage)
1735 self._downloader.trouble(u'ERROR: unable to extract title')
1737 video_title = mobj.group(1).decode('utf-8')
1738 video_title = sanitize_title(video_title)
1739 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1741 # video uploader is domain name
1742 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1744 self._downloader.trouble(u'ERROR: unable to extract title')
1746 video_uploader = mobj.group(1).decode('utf-8')
1749 # Process video information
1750 self._downloader.process_info({
1751 'id': video_id.decode('utf-8'),
1752 'url': video_url.decode('utf-8'),
1753 'uploader': video_uploader,
1754 'upload_date': u'NA',
1755 'title': video_title,
1756 'stitle': simple_title,
1757 'ext': video_extension.decode('utf-8'),
1761 except UnavailableVideoError, err:
1762 self._downloader.trouble(u'\nERROR: unable to download video')
1765 class YoutubeSearchIE(InfoExtractor):
1766 """Information Extractor for YouTube search queries."""
1767 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1768 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1769 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1770 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1772 _max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
    """Keep the YouTube extractor used to fetch each search result."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
1780 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce the download of one page of search results for *query*."""
    # Queries arrive as encoded bytes; decode for display purposes only.
    decoded_query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum))
def _real_initialize(self):
    """Ensure the wrapped YouTube extractor is initialized."""
    self._youtube_ie.initialize()
1790 def _real_extract(self, query):
1791 mobj = re.match(self._VALID_QUERY, query)
1793 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1796 prefix, query = query.split(':')
1798 query = query.encode('utf-8')
1800 self._download_n_results(query, 1)
1802 elif prefix == 'all':
1803 self._download_n_results(query, self._max_youtube_results)
1809 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1811 elif n > self._max_youtube_results:
1812 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1813 n = self._max_youtube_results
1814 self._download_n_results(query, n)
1816 except ValueError: # parsing prefix as integer fails
1817 self._download_n_results(query, 1)
1820 def _download_n_results(self, query, n):
1821 """Downloads a specified number of results for a query"""
1824 already_seen = set()
1828 self.report_download_page(query, pagenum)
1829 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1830 request = urllib2.Request(result_url)
1832 page = urllib2.urlopen(request).read()
1833 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1834 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1837 # Extract video identifiers
1838 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1839 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1840 if video_id not in already_seen:
1841 video_ids.append(video_id)
1842 already_seen.add(video_id)
1843 if len(video_ids) == n:
1844 # Specified n videos reached
1845 for id in video_ids:
1846 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1849 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1850 for id in video_ids:
1851 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1854 pagenum = pagenum + 1
1856 class GoogleSearchIE(InfoExtractor):
1857 """Information Extractor for Google Video search queries."""
1858 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1859 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1860 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1861 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1863 _max_google_results = 1000
def __init__(self, google_ie, downloader=None):
    """Keep the Google Video extractor used to fetch each search result."""
    self._google_ie = google_ie
    InfoExtractor.__init__(self, downloader)
1871 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce the download of one page of search results for *query*."""
    # Queries arrive as encoded bytes; decode for display purposes only.
    decoded_query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (decoded_query, pagenum))
def _real_initialize(self):
    """Ensure the wrapped Google Video extractor is initialized."""
    self._google_ie.initialize()
1881 def _real_extract(self, query):
1882 mobj = re.match(self._VALID_QUERY, query)
1884 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1887 prefix, query = query.split(':')
1889 query = query.encode('utf-8')
1891 self._download_n_results(query, 1)
1893 elif prefix == 'all':
1894 self._download_n_results(query, self._max_google_results)
1900 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1902 elif n > self._max_google_results:
1903 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1904 n = self._max_google_results
1905 self._download_n_results(query, n)
1907 except ValueError: # parsing prefix as integer fails
1908 self._download_n_results(query, 1)
1911 def _download_n_results(self, query, n):
1912 """Downloads a specified number of results for a query"""
1915 already_seen = set()
1919 self.report_download_page(query, pagenum)
1920 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1921 request = urllib2.Request(result_url)
1923 page = urllib2.urlopen(request).read()
1924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1928 # Extract video identifiers
1929 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1930 video_id = mobj.group(1)
1931 if video_id not in already_seen:
1932 video_ids.append(video_id)
1933 already_seen.add(video_id)
1934 if len(video_ids) == n:
1935 # Specified n videos reached
1936 for id in video_ids:
1937 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1940 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1941 for id in video_ids:
1942 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1945 pagenum = pagenum + 1
1947 class YahooSearchIE(InfoExtractor):
1948 """Information Extractor for Yahoo! Video search queries."""
1949 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1950 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1951 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1952 _MORE_PAGES_INDICATOR = r'\s*Next'
1954 _max_yahoo_results = 1000
def __init__(self, yahoo_ie, downloader=None):
    """Keep the Yahoo! Video extractor used to fetch each search result."""
    self._yahoo_ie = yahoo_ie
    InfoExtractor.__init__(self, downloader)
1962 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Announce the download of one page of search results for *query*."""
    # Queries arrive as encoded bytes; decode for display purposes only.
    decoded_query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (decoded_query, pagenum))
def _real_initialize(self):
    """Ensure the wrapped Yahoo! Video extractor is initialized."""
    self._yahoo_ie.initialize()
1972 def _real_extract(self, query):
1973 mobj = re.match(self._VALID_QUERY, query)
1975 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1978 prefix, query = query.split(':')
1980 query = query.encode('utf-8')
1982 self._download_n_results(query, 1)
1984 elif prefix == 'all':
1985 self._download_n_results(query, self._max_yahoo_results)
1991 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1993 elif n > self._max_yahoo_results:
1994 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1995 n = self._max_yahoo_results
1996 self._download_n_results(query, n)
1998 except ValueError: # parsing prefix as integer fails
1999 self._download_n_results(query, 1)
2002 def _download_n_results(self, query, n):
2003 """Downloads a specified number of results for a query"""
2006 already_seen = set()
2010 self.report_download_page(query, pagenum)
2011 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2012 request = urllib2.Request(result_url)
2014 page = urllib2.urlopen(request).read()
2015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2019 # Extract video identifiers
2020 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2021 video_id = mobj.group(1)
2022 if video_id not in already_seen:
2023 video_ids.append(video_id)
2024 already_seen.add(video_id)
2025 if len(video_ids) == n:
2026 # Specified n videos reached
2027 for id in video_ids:
2028 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2031 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2032 for id in video_ids:
2033 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2036 pagenum = pagenum + 1
# NOTE(review): numbered listing with elided lines; comments cover visible code only.
# Walks every page of a YouTube playlist (or user "play_list" view), collects
# video ids, then delegates each watch URL to the wrapped YoutubeIE.
2038 class YoutubePlaylistIE(InfoExtractor):
2039 """Information Extractor for YouTube playlists."""
# NOTE(review): the dot in "youtube.com" is unescaped here (matches any char) —
# harmless in practice but worth confirming against the project's other patterns.
2041 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
2042 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2043 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2044 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2047 def __init__(self, youtube_ie, downloader=None):
2048 InfoExtractor.__init__(self, downloader)
# Single-video extractor that performs the actual download per id.
2049 self._youtube_ie = youtube_ie
2053 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2055 def report_download_page(self, playlist_id, pagenum):
2056 """Report attempt to download playlist page with given number."""
2057 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2059 def _real_initialize(self):
2060 self._youtube_ie.initialize()
2062 def _real_extract(self, url):
2063 # Extract playlist id
2064 mobj = re.match(self._VALID_URL, url)
2066 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2069 # Download playlist pages
2070 playlist_id = mobj.group(1)
2075 self.report_download_page(playlist_id, pagenum)
2076 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2078 page = urllib2.urlopen(request).read()
2079 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2080 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2083 # Extract video identifiers
# ids_in_page deduplicates within a page; duplicates may repeat across pages.
2085 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2086 if mobj.group(1) not in ids_in_page:
2087 ids_in_page.append(mobj.group(1))
2088 video_ids.extend(ids_in_page)
# No "Next" link: last playlist page reached.
2090 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2092 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end. playliststart is 1-based from the user.
2094 playliststart = self._downloader.params.get('playliststart', 1) - 1
2095 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend == -1 this slice discards the
# final video id (xs[a:-1]); looks like an off-by-one — confirm intended semantics.
2096 video_ids = video_ids[playliststart:playlistend]
2098 for id in video_ids:
2099 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): numbered listing with elided lines; comments cover visible code only.
# Fetches a user's GData feed, extracts video ids, and delegates each watch URL
# to the wrapped YoutubeIE. Unlike the playlist IE, only one page is fetched.
2102 class YoutubeUserIE(InfoExtractor):
2103 """Information Extractor for YouTube users."""
2105 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
# GData API endpoint for a user's uploads; %s is the username.
2106 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2107 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2110 def __init__(self, youtube_ie, downloader=None):
2111 InfoExtractor.__init__(self, downloader)
2112 self._youtube_ie = youtube_ie
2116 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2118 def report_download_page(self, username):
2119 """Report attempt to download user page."""
2120 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2122 def _real_initialize(self):
2123 self._youtube_ie.initialize()
2125 def _real_extract(self, url):
2127 mobj = re.match(self._VALID_URL, url)
2129 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2132 # Download user page
2133 username = mobj.group(1)
2137 self.report_download_page(username)
2138 request = urllib2.Request(self._TEMPLATE_URL % (username))
2140 page = urllib2.urlopen(request).read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2145 # Extract video identifiers
# ids_in_page preserves first-seen order while deduplicating.
2148 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2149 if mobj.group(1) not in ids_in_page:
2150 ids_in_page.append(mobj.group(1))
2151 video_ids.extend(ids_in_page)
# Apply --playlist-start/--playlist-end to the user's feed as well.
2153 playliststart = self._downloader.params.get('playliststart', 1) - 1
2154 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): default playlistend == -1 makes this slice drop the final id
# (same apparent off-by-one as the playlist extractor) — confirm.
2155 video_ids = video_ids[playliststart:playlistend]
2157 for id in video_ids:
2158 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): numbered listing with elided lines; comments cover visible code only.
# Extracts the direct download URL for a depositfiles.com file by simulating a
# press of the "Free download" button and scraping the resulting page.
2161 class DepositFilesIE(InfoExtractor):
2162 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the optional "../" path segment is the locale.
2164 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2166 def __init__(self, downloader=None):
2167 InfoExtractor.__init__(self, downloader)
2171 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2173 def report_download_webpage(self, file_id):
2174 """Report webpage download."""
2175 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2177 def report_extraction(self, file_id):
2178 """Report information extraction."""
2179 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2181 def _real_initialize(self):
2184 def _real_extract(self, url):
2185 # At this point we have a new file
2186 self._downloader.increment_downloads()
2188 file_id = url.split('/')[-1]
2189 # Rebuild url in english locale
2190 url = 'http://depositfiles.com/en/files/' + file_id
2192 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates clicking the free-download button.
2193 free_download_indication = { 'gateway_result' : '1' }
2194 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2196 self.report_download_webpage(file_id)
2197 webpage = urllib2.urlopen(request).read()
2198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2199 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2202 # Search for the real file URL
2203 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2204 if (mobj is None) or (mobj.group(1) is None):
2205 # Try to figure out reason of the error.
# The site explains restrictions in a <strong>Attention...</strong> blurb.
2206 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2207 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace so the message fits on one error line.
2208 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2209 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2211 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2214 file_url = mobj.group(1)
# Extension without the leading dot.
2215 file_extension = os.path.splitext(file_url)[1][1:]
2217 # Search for file title
2218 mobj = re.search(r'<b title="(.*?)">', webpage)
2220 self._downloader.trouble(u'ERROR: unable to extract title')
2222 file_title = mobj.group(1).decode('utf-8')
2225 # Process file information
2226 self._downloader.process_info({
2227 'id': file_id.decode('utf-8'),
2228 'url': file_url.decode('utf-8'),
2230 'upload_date': u'NA',
2231 'title': file_title,
2232 'stitle': file_title,
2233 'ext': file_extension.decode('utf-8'),
2237 except UnavailableVideoError, err:
2238 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): numbered listing; parts of the docstrings are elided.
# Base class for post-processing hooks; concrete PPs override run().
2240 class PostProcessor(object):
2241 """Post Processor class.
2243 PostProcessor objects can be added to downloaders with their
2244 add_post_processor() method. When the downloader has finished a
2245 successful download, it will take its internal chain of PostProcessors
2246 and start calling the run() method on each one of them, first with
2247 an initial argument and then with the returned value of the previous
2250 The chain will be stopped if one of them ever returns None or the end
2251 of the chain is reached.
2253 PostProcessor objects follow a "mutual registration" process similar
2254 to InfoExtractor objects.
2259 def __init__(self, downloader=None):
2260 self._downloader = downloader
2262 def set_downloader(self, downloader):
2263 """Sets the downloader for this PP."""
2264 self._downloader = downloader
2266 def run(self, information):
2267 """Run the PostProcessor.
2269 The "information" argument is a dictionary like the ones
2270 composed by InfoExtractors. The only difference is that this
2271 one has an extra field called "filepath" that points to the
2274 When this method returns None, the postprocessing chain is
2275 stopped. However, this method may return an information
2276 dictionary that will be passed to the next postprocessing
2277 object in the chain. It can be the one it received after
2278 changing some fields.
2280 In addition, this method may raise a PostProcessingError
2281 exception that will be taken into account by the downloader
# Base implementation: pass the info dict through unchanged.
2284 return information # by default, do nothing
2286 ### MAIN PROGRAM ###
2287 if __name__ == '__main__':
2289 # Modules needed only when running the main program
# (Was "bitbucket.org"; the URLs below actually point at github.com.)
2293 # Function to update the program file with the latest version from github.com
2294 def update_self(downloader, filename):
2295 # Note: downloader only used for options
# Refuse to update if the script file itself is not writable.
2296 if not os.access (filename, os.W_OK):
2297 sys.exit('ERROR: no write permissions on %s' % filename)
2299 downloader.to_screen('Updating to latest stable version...')
# LATEST_VERSION holds the tag name of the newest release.
2300 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2301 latest_version = urllib.urlopen(latest_url).read().strip()
2302 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2303 newcontent = urllib.urlopen(prog_url).read()
# Overwrite this script in place with the downloaded release.
2304 stream = open(filename, 'w')
2305 stream.write(newcontent)
2307 downloader.to_screen('Updated to version %s' % latest_version)
2309 # Parse command line
# conflict_handler='resolve' lets -h/-v be rebound below without raising.
2310 parser = optparse.OptionParser(
2311 usage='Usage: %prog [options] url...',
2312 version='2010.12.09',
2313 conflict_handler='resolve',
# General options.
2316 parser.add_option('-h', '--help',
2317 action='help', help='print this help text and exit')
2318 parser.add_option('-v', '--version',
2319 action='version', help='print program version and exit')
2320 parser.add_option('-U', '--update',
2321 action='store_true', dest='update_self', help='update this program to latest stable version')
2322 parser.add_option('-i', '--ignore-errors',
2323 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2324 parser.add_option('-r', '--rate-limit',
2325 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2326 parser.add_option('-R', '--retries',
2327 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2328 parser.add_option('--playlist-start',
2329 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2330 parser.add_option('--playlist-end',
2331 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2332 parser.add_option('--dump-user-agent',
2333 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2335 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2336 authentication.add_option('-u', '--username',
2337 dest='username', metavar='USERNAME', help='account username')
2338 authentication.add_option('-p', '--password',
2339 dest='password', metavar='PASSWORD', help='account password')
2340 authentication.add_option('-n', '--netrc',
2341 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2342 parser.add_option_group(authentication)
2344 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2345 video_format.add_option('-f', '--format',
2346 action='store', dest='format', metavar='FORMAT', help='video format code')
# '-1' is the sentinel meaning "all formats"; checked when building outtmpl.
2347 video_format.add_option('--all-formats',
2348 action='store_const', dest='format', help='download all available video formats', const='-1')
2349 video_format.add_option('--max-quality',
2350 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2351 parser.add_option_group(video_format)
2353 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2354 verbosity.add_option('-q', '--quiet',
2355 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2356 verbosity.add_option('-s', '--simulate',
2357 action='store_true', dest='simulate', help='do not download video', default=False)
# The --get-* options imply both quiet and simulate (combined below).
2358 verbosity.add_option('-g', '--get-url',
2359 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2360 verbosity.add_option('-e', '--get-title',
2361 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2362 verbosity.add_option('--get-thumbnail',
2363 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2364 verbosity.add_option('--get-description',
2365 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2366 verbosity.add_option('--no-progress',
2367 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2368 verbosity.add_option('--console-title',
2369 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2370 parser.add_option_group(verbosity)
2372 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2373 filesystem.add_option('-t', '--title',
2374 action='store_true', dest='usetitle', help='use title in file name', default=False)
2375 filesystem.add_option('-l', '--literal',
2376 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2377 filesystem.add_option('-A', '--auto-number',
2378 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2379 filesystem.add_option('-o', '--output',
2380 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2381 filesystem.add_option('-a', '--batch-file',
2382 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2383 filesystem.add_option('-w', '--no-overwrites',
2384 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2385 filesystem.add_option('-c', '--continue',
2386 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2387 filesystem.add_option('--cookies',
2388 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2389 filesystem.add_option('--no-part',
2390 action='store_true', dest='nopart', help='do not use .part files', default=False)
2391 parser.add_option_group(filesystem)
2393 (opts, args) = parser.parse_args()
2395 # Open appropriate CookieJar
# In-memory jar by default; Mozilla-format file jar when --cookies is given.
2396 if opts.cookiefile is None:
2397 jar = cookielib.CookieJar()
2400 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load the file if it already exists and is readable.
2401 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2403 except (IOError, OSError), err:
2404 sys.exit(u'ERROR: unable to open cookie file')
2407 if opts.dump_user_agent:
2408 print std_headers['User-Agent']
2411 # General configuration
# Install a global opener so every urllib2 call shares proxy, cookie and
# custom (YoutubeDLHandler) handling.
2412 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2413 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2414 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2416 # Batch file verification
2418 if opts.batchfile is not None:
# '-' means read the URL list from stdin.
2420 if opts.batchfile == '-':
2423 batchfd = open(opts.batchfile, 'r')
2424 batchurls = batchfd.readlines()
2425 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
2426 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2428 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come before command-line URLs.
2429 all_urls = batchurls + args
2431 # Conflicting, missing and erroneous options
2432 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2433 parser.error(u'using .netrc conflicts with giving username/password')
2434 if opts.password is not None and opts.username is None:
2435 parser.error(u'account username missing')
2436 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2437 parser.error(u'using output template conflicts with using title, literal title or auto number')
2438 if opts.usetitle and opts.useliteral:
2439 parser.error(u'using title conflicts with using literal title')
# Prompt interactively for the password when only a username was supplied.
2440 if opts.username is not None and opts.password is None:
2441 opts.password = getpass.getpass(u'Type account password and press return:')
2442 if opts.ratelimit is not None:
# parse_bytes understands suffixed values like "50k" / "44.6m".
2443 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2444 if numeric_limit is None:
2445 parser.error(u'invalid rate limit specified')
2446 opts.ratelimit = numeric_limit
2447 if opts.retries is not None:
2449 opts.retries = long(opts.retries)
2450 except (TypeError, ValueError), err:
2451 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be positive; end is -1 (no limit) or >= start.
2453 opts.playliststart = long(opts.playliststart)
2454 if opts.playliststart <= 0:
2456 except (TypeError, ValueError), err:
2457 parser.error(u'invalid playlist start number specified')
2459 opts.playlistend = long(opts.playlistend)
2460 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2462 except (TypeError, ValueError), err:
2463 parser.error(u'invalid playlist end number specified')
2465 # Information extractors
# Instantiate every extractor; search/playlist/user IEs wrap a single-video IE.
2466 youtube_ie = YoutubeIE()
2467 metacafe_ie = MetacafeIE(youtube_ie)
2468 dailymotion_ie = DailymotionIE()
2469 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2470 youtube_user_ie = YoutubeUserIE(youtube_ie)
2471 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2472 google_ie = GoogleIE()
2473 google_search_ie = GoogleSearchIE(google_ie)
2474 photobucket_ie = PhotobucketIE()
2475 yahoo_ie = YahooIE()
2476 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2477 deposit_files_ie = DepositFilesIE()
2478 generic_ie = GenericIE()
# Build the downloader from the parsed options.
2481 fd = FileDownloader({
2482 'usenetrc': opts.usenetrc,
2483 'username': opts.username,
2484 'password': opts.password,
# Any --get-* option implies quiet + simulate.
2485 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2486 'forceurl': opts.geturl,
2487 'forcetitle': opts.gettitle,
2488 'forcethumbnail': opts.getthumbnail,
2489 'forcedescription': opts.getdescription,
2490 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2491 'format': opts.format,
2492 'format_limit': opts.format_limit,
# Output template priority: explicit -o, then --all-formats variants,
# then title/literal/autonumber combinations, finally plain id.ext.
2493 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2494 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2495 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2496 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2497 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2498 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2499 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2500 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2501 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2502 or u'%(id)s.%(ext)s'),
2503 'ignoreerrors': opts.ignoreerrors,
2504 'ratelimit': opts.ratelimit,
2505 'nooverwrites': opts.nooverwrites,
2506 'retries': opts.retries,
2507 'continuedl': opts.continue_dl,
2508 'noprogress': opts.noprogress,
2509 'playliststart': opts.playliststart,
2510 'playlistend': opts.playlistend,
# '-o -' writes the video to stdout, so logging must go to stderr.
2511 'logtostderr': opts.outtmpl == '-',
2512 'consoletitle': opts.consoletitle,
2513 'nopart': opts.nopart,
# Registration order matters: more specific IEs (search, playlist, user)
# are tried before the single-video IEs they wrap.
2515 fd.add_info_extractor(youtube_search_ie)
2516 fd.add_info_extractor(youtube_pl_ie)
2517 fd.add_info_extractor(youtube_user_ie)
2518 fd.add_info_extractor(metacafe_ie)
2519 fd.add_info_extractor(dailymotion_ie)
2520 fd.add_info_extractor(youtube_ie)
2521 fd.add_info_extractor(google_ie)
2522 fd.add_info_extractor(google_search_ie)
2523 fd.add_info_extractor(photobucket_ie)
2524 fd.add_info_extractor(yahoo_ie)
2525 fd.add_info_extractor(yahoo_search_ie)
2526 fd.add_info_extractor(deposit_files_ie)
2528 # This must come last since it's the
2529 # fallback if none of the others work
2530 fd.add_info_extractor(generic_ie)
# --update replaces this script (sys.argv[0]) with the latest release.
2533 if opts.update_self:
2534 update_self(fd, sys.argv[0])
2537 if len(all_urls) < 1:
# Running with no URLs is only valid together with --update.
2538 if not opts.update_self:
2539 parser.error(u'you must provide at least one URL')
2542 retcode = fd.download(all_urls)
2544 # Dump cookie jar if requested
2545 if opts.cookiefile is not None:
2548 except (IOError, OSError), err:
2549 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling: map known failure modes to exit messages.
2553 except DownloadError:
2555 except SameFileError:
2556 sys.exit(u'ERROR: fixed output name but more than one file to download')
2557 except KeyboardInterrupt:
2558 sys.exit(u'\nERROR: Interrupted by user')