2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
32 # parse_qs was moved from the cgi module to the urlparse module recently.
34 from urlparse import parse_qs
36 from cgi import parse_qs
39 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
40 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
41 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 'Accept-Encoding': 'gzip, deflate',
43 'Accept-Language': 'en-us,en;q=0.5',
# Unicode alphabet of characters kept verbatim by title simplification
# (ASCII letters and digits, decoded so (?u) regexes built from it operate
# on unicode); everything outside this set is collapsed to '_'.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks. Falls back to
	'UTF-8' when the reported encoding is unusable.
	"""
	# The original routed this through a throwaway generator and called
	# .next() on it, which added nothing and tied the code to Python 2's
	# iterator protocol; a plain try/except is equivalent and simpler.
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec: some broken locales report an encoding the
		# codec machinery cannot actually handle.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function. Unknown entities are returned literally.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference. The previous pattern used x?\d+, which
	# rejected hexadecimal digits a-f (e.g. &#xa9;); hexadecimal and
	# decimal forms are therefore matched separately here.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# '0x...' form accepted by long()/int() with base 16.
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
90 def sanitize_title(utitle):
91 """Sanitizes a video title so it could be used as part of a filename."""
92 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
93 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function would.

	It returns the tuple (stream, definitive_file_name).
	"""
	# NOTE(review): the enclosing try and the filename == u'-' stdout
	# branch header are elided in this view.
	if sys.platform == 'win32':
		# Put stdout into binary mode on Windows so video bytes are not
		# mangled by newline translation.
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
	return (sys.stdout, filename)
	stream = open(filename, open_mode)
	return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns None when the string cannot be parsed.
	"""
	# Initialize explicitly: the visible code only assigned a timestamp on
	# the successful-parse branch, so unparseable input had no defined
	# result.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Record both byte counts so the caller can report the mismatch.
		self.downloaded, self.expected = downloaded, expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	# NOTE(review): the deflate() staticmethod header and its try/except
	# lines are elided in this view. Negative window bits decompress a raw
	# deflate stream (no zlib header); the plain call is the fallback for
	# zlib-wrapped data.
		return zlib.decompress(data, -zlib.MAX_WBITS)
		return zlib.decompress(data)

	def addinfourl_wrapper(stream, headers, url, code):
		# Older Python 2.x addinfourl has no 'code' argument/getcode();
		# emulate it when missing.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		# NOTE(review): assignment of ret.code and the return are elided.

	def http_request(self, req):
		# Install every standard header on the outgoing request.
		for h in std_headers:
			# NOTE(review): removal of a pre-existing duplicate header is
			# elided in this view.
			req.add_header(h, std_headers[h])
		# Internal marker header: strip it and disable compression.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		# NOTE(review): the 'return req' line is elided in this view.

	def http_response(self, req, resp):
		# NOTE(review): the 'old_resp = resp' assignment is elided here.
		# Transparently decompress a gzip-encoded body.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# Transparently decompress a deflate-encoded body.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename:    Force printing final filename.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	consoletitle:     Display progress in console window's titlebar.
	nopart:           Do not use temporary .part files.
	"""

	# NOTE(review): sibling class attributes (params, extractor and
	# postprocessor lists) are elided in this view.
	_download_retcode = None  # set to 1 by trouble() when errors are ignored
	_num_downloads = None     # ordinal feeding the %(autonumber)s template
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# NOTE(review): initialization of the InfoExtractor/PostProcessor
		# lists and of self.params is elided in this view.
		self._download_retcode = 0
		self._num_downloads = 0
		# Send progress output to stderr when requested, e.g. because
		# stdout is the download target ('-').
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every ancestor path of the file (excluding the basename).
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				# NOTE(review): the os.mkdir(dir) call is elided in this view.
	def format_bytes(bytes):
		# Human-readable size string, e.g. 1048576 -> '1.00M'.
		# NOTE(review): the None guard is elided in this view.
		if type(bytes) is str:
			# NOTE(review): the str-to-float conversion is elided here.
		# NOTE(review): the zero-bytes special case is elided; log base
		# 1024 selects the suffix index.
		exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)
	def calc_percent(byte_counter, data_len):
		# Fixed-width percentage string, e.g. ' 42.1%'.
		# NOTE(review): the data_len-is-None guard is elided in this view.
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	def calc_eta(start, now, total, current):
		# Estimated time of arrival as 'MM:SS' from bytes done vs. total.
		# NOTE(review): the total-is-None guard and 'dif = now - start'
		# are elided in this view.
		if current == 0 or dif < 0.001: # One millisecond
			# NOTE(review): the '--:--' fallback return is elided.
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# NOTE(review): the 99:59 overflow guard is elided in this view.
		return '%02d:%02d' % (eta_mins, eta_secs)
	def calc_speed(start, now, bytes):
		# Fixed-width transfer-rate string, e.g. ' 1.00Mb/s'.
		# NOTE(review): 'dif = now - start' is elided in this view.
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	def best_block_size(elapsed_time, bytes):
		# Adapt the next read size to measured throughput, bounded to at
		# most double/halve the previous block and capped at 4 MiB.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			# NOTE(review): the fast-path return is elided in this view.
		rate = bytes / elapsed_time
		# NOTE(review): the clamped-return lines are elided in this view.
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		# NOTE(review): the no-match (return None) guard is elided here.
		number = float(matchobj.group(1))
		# An empty suffix finds index 0 in the string => multiplier 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): the list append is elided in this view.
		# Mutual registration: the IE learns about its downloader.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): the list append is elided in this view.
		pp.set_downloader(self)
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		# NOTE(review): the enclosing try line is elided in this view.
		if not self.params.get('quiet', False):
			# skip_eol suppresses the newline so progress lines can be
			# rewritten in place with '\r'.
			terminator = [u'\n', u''][skip_eol]
			print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				# NOTE(review): the re-raise is elided in this view.

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			# NOTE(review): the early return is elided in this view.
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm OSC escape sequence for setting the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
418 def trouble(self, message=None):
419 """Determine action to take when a download problem appears.
421 Depending on if the downloader has been configured to ignore
422 download errors or not, this method may throw an exception or
423 not when errors are found, after printing the message.
425 if message is not None:
426 self.to_stderr(message)
427 if not self.params.get('ignoreerrors', False):
428 raise DownloadError(message)
429 self._download_retcode = 1
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			# NOTE(review): the early return is elided in this view.
		# NOTE(review): 'now = time.time()' is elided in this view.
		elapsed = now - start_time
		# NOTE(review): the non-positive-elapsed guard is elided here.
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to fall back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		# Download straight to the target when .part files are disabled,
		# when writing to stdout ('-'), or when the target exists but is
		# not a regular file (e.g. a named pipe).
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			# NOTE(review): the 'return filename' line is elided here.
		return filename + u'.part'
451 def undo_temp_name(self, filename):
452 if filename.endswith(u'.part'):
453 return filename[:-len(u'.part')]
	def try_rename(self, old_filename, new_filename):
		# Atomically move the finished .part file into place, reporting
		# (rather than raising) on failure.
		# NOTE(review): the enclosing try line is elided in this view.
		if old_filename == new_filename:
			# NOTE(review): the early return is elided in this view.
		os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			# NOTE(review): the early return is elided in this view.
		# '\r' rewrites the current terminal line instead of appending.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		# NOTE(review): the enclosing try line is elided in this view.
		self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the unencodable filename.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		# NOTE(review): the else branch (terminating the progress line)
		# is elided in this view.
503 def increment_downloads(self):
504 """Increment the ordinal that assigns a number to each file."""
505 self._num_downloads += 1
	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		# NOTE(review): the enclosing try line is elided in this view.
		template_dict = dict(info_dict)
		# Extra fields made available to the output template.
		template_dict['epoch'] = unicode(long(time.time()))
		template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
		filename = self.params['outtmpl'] % template_dict
		# NOTE(review): the 'return filename' line is elided here.
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			# NOTE(review): the 'return None' line is elided here.
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings still run in simulate mode.
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')
			# NOTE(review): the 'return' ending simulate mode is elided.

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			# NOTE(review): the early return is elided in this view.

		# NOTE(review): the enclosing try line is elided in this view.
		self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			# NOTE(review): the early return is elided in this view.

		# NOTE(review): the enclosing try line is elided in this view.
		success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem errors mean this format cannot be saved.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			# NOTE(review): the early return is elided in this view.
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			# NOTE(review): the early return is elided in this view.

		# NOTE(review): the success guard and try line are elided here.
		self.post_process(filename, info_dict)
		except (PostProcessingError), err:
			self.trouble(u'ERROR: postprocessing: %s' % str(err))
	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed output template cannot hold more than one file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		# NOTE(review): the 'for url in url_list' / 'for ie in self._ies'
		# loop headers are elided in this view.
		suitable_found = False
		# Go to next InfoExtractor if not suitable
		if not ie.suitable(url):
			# NOTE(review): the 'continue' is elided in this view.

		# Suitable InfoExtractor found
		suitable_found = True

		# Extract information from URL and process it
		# NOTE(review): the ie.extract(url) call is elided in this view.

		# Suitable InfoExtractor had been found; go to next URL
		# NOTE(review): the 'break' is elided in this view.

		if not suitable_found:
			self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): the 'info = dict(ie_info)' copy and the loop over
		# the registered postprocessors are elided in this view.
		info['filepath'] = filename
	def _download_with_rtmpdump(self, filename, url, player_url):
		# Delegate RTMP URLs to the external rtmpdump tool.
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		# NOTE(review): the enclosing try line is elided in this view.
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			# NOTE(review): the failure return is elided in this view.

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress since the last attempt; stop retrying.
				# NOTE(review): the loop 'break' is elided in this view.
		# NOTE(review): the retval == 0 success check is elided here.
		self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
		self.try_rename(tmpfilename, filename)
		# NOTE(review): the success return and else branch are elided.
		self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
		# NOTE(review): the failure return is elided in this view.
	def _do_download(self, filename, url, player_url):
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			# NOTE(review): the success return is elided in this view.

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header for the 416
		# fallback below.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		# NOTE(review): the else branch (resume_len = 0) is elided.

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)

		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			# NOTE(review): the try line and loop 'break' are elided.
			data = urllib2.urlopen(request)
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					# Open the connection again without the range header
					data = urllib2.urlopen(basic_request)
					content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
					# Examine the reported length
					if (content_length is not None and
						(resume_len - 100 < long(content_length) < resume_len + 100)):
						# The file had already been fully downloaded.
						# Explanation to the above condition: in issue #175 it was revealed that
						# YouTube sometimes adds or removes a few bytes from the end of the file,
						# changing the file size slightly and causing problems for some users. So
						# I decided to implement a suggested change and consider the file
						# completely downloaded if the file size differs less than 100 bytes from
						# the one in the hard drive.
						self.report_file_already_downloaded(filename)
						self.try_rename(tmpfilename, filename)
						# NOTE(review): the success return is elided here.
					# The length does not match, we start the download over
					self.report_unable_to_resume()
			# NOTE(review): the retry-count increment is elided here.
			self.report_retry(count, retries)
		# NOTE(review): the retries-exhausted check is elided here.
		self.trouble(u'ERROR: giving up after %s retries' % retries)

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		# NOTE(review): block_size/start initialization and the main
		# read-loop header are elided in this view.
		data_block = data.read(block_size)
		if len(data_block) == 0:
			# End of stream. NOTE(review): the loop 'break' is elided.
		byte_counter += len(data_block)

		# Open file just in time
		# NOTE(review): the first-iteration guard and try line are elided;
		# the server may have renamed the file (sanitize_open), hence the
		# undo_temp_name round trip.
		(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
		filename = self.undo_temp_name(tmpfilename)
		self.report_destination(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
		stream.write(data_block)
		except (IOError, OSError), err:
			self.trouble(u'\nERROR: unable to write data: %s' % str(err))
		# Adapt the block size to the observed throughput.
		block_size = self.best_block_size(after - before, len(data_block))

		# Progress message for this iteration.
		percent_str = self.calc_percent(byte_counter, data_len)
		eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
		speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
		self.report_progress(percent_str, data_len_str, speed_str, eta_str)

		# Apply rate limit
		self.slow_down(start, byte_counter - resume_len)

		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
		# Update file modification time
		timestr = data.info().get('last-modified', None)
		if timestr is not None:
			filetime = timeconvert(timestr)
			if filetime is not None:
				os.utime(filename,(time.time(), filetime))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:         Video identifier.
	url:        Final video URL.
	uploader:   Nickname of the video uploader.
	title:      Literal title.
	stitle:     Simplified title.
	ext:        Video filename extension.
	format:     Video format.
	player_url: SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:  Full URL to a video thumbnail image.
	description: One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# NOTE(review): the '_ready = False' initialization is elided.
		self.set_downloader(downloader)

	# NOTE(review): the 'def suitable(url)' staticmethod header is elided
	# in this view.
		"""Receives a URL and returns True if suitable for this IE."""

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# NOTE(review): the already-initialized guard is elided here.
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# NOTE(review): the initialize() call is elided in this view.
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video IDs plus watch/embed/v/youtu.be URL forms;
	# group 2 captures the video ID.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL pins pages to English so scraping is deterministic.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format code -> file extension.
	_video_extensions = {
		# NOTE(review): most dictionary entries are elided in this view.
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever

	# NOTE(review): the 'def suitable(url)' staticmethod header is elided
	# in this view.
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		# Language, login, and age-gate setup; skipped when no downloader
		# is attached (no params to read, nowhere to report).
		if self._downloader is None:
			# NOTE(review): the early return is elided in this view.

		# NOTE(review): username/password default initialization is elided.
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# NOTE(review): the enclosing try line is elided in this view.
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			# NOTE(review): the (login, account, password) unpacking and
			# its None check are elided in this view.
				raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# Best-effort: warn and continue unauthenticated.
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				# NOTE(review): the early return is elided in this view.

		# Force English pages for deterministic scraping.
		request = urllib2.Request(self._LANG_URL)
		# NOTE(review): the try line and report_lang() call are elided.
		urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			# NOTE(review): the early return is elided in this view.

		# No authentication to be performed
		# NOTE(review): the username-is-None guard is elided in this view.

		# Log in.
		# NOTE(review): the login_form dict opening is elided in this view.
			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		# NOTE(review): the try line and report_login() call are elided.
		login_results = urllib2.urlopen(request).read()
		# The login form re-appearing in the response means login failed.
		if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
			# NOTE(review): the early return is elided in this view.
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			# NOTE(review): the early return is elided in this view.

		# Confirm the age gate.
		# NOTE(review): the age_form dict opening is elided in this view.
			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		# NOTE(review): the enclosing try line is elided in this view.
		self.report_age_confirmation()
		age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# NOTE(review): the early return is elided in this view.
968 def _real_extract(self, url):
969 # Extract video id from URL
970 mobj = re.match(self._VALID_URL, url)
972 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
974 video_id = mobj.group(2)
977 self.report_video_webpage_download(video_id)
978 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
980 video_webpage = urllib2.urlopen(request).read()
981 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
982 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
985 # Attempt to extract SWF player URL
986 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
988 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
993 self.report_video_info_webpage_download(video_id)
994 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
995 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
996 % (video_id, el_type))
997 request = urllib2.Request(video_info_url)
999 video_info_webpage = urllib2.urlopen(request).read()
1000 video_info = parse_qs(video_info_webpage)
1001 if 'token' in video_info:
1003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1006 if 'token' not in video_info:
1007 if 'reason' in video_info:
1008 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1010 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1013 # Start extracting information
1014 self.report_information_extraction(video_id)
1017 if 'author' not in video_info:
1018 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1020 video_uploader = urllib.unquote_plus(video_info['author'][0])
1023 if 'title' not in video_info:
1024 self._downloader.trouble(u'ERROR: unable to extract video title')
1026 video_title = urllib.unquote_plus(video_info['title'][0])
1027 video_title = video_title.decode('utf-8')
1028 video_title = sanitize_title(video_title)
1031 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1032 simple_title = simple_title.strip(ur'_')
1035 if 'thumbnail_url' not in video_info:
1036 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1037 video_thumbnail = ''
1038 else: # don't panic if we can't find it
1039 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1043 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1044 if mobj is not None:
1045 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1046 format_expressions = ['%d %B %Y', '%B %d %Y']
1047 for expression in format_expressions:
1049 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1054 video_description = 'No description available.'
1055 if self._downloader.params.get('forcedescription', False):
1056 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1057 if mobj is not None:
1058 video_description = mobj.group(1)
1061 video_token = urllib.unquote_plus(video_info['token'][0])
1063 # Decide which formats to download
1064 req_format = self._downloader.params.get('format', None)
1066 if 'fmt_url_map' in video_info:
1067 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1068 format_limit = self._downloader.params.get('format_limit', None)
1069 if format_limit is not None and format_limit in self._available_formats:
1070 format_list = self._available_formats[self._available_formats.index(format_limit):]
1072 format_list = self._available_formats
1073 existing_formats = [x for x in format_list if x in url_map]
1074 if len(existing_formats) == 0:
1075 self._downloader.trouble(u'ERROR: no known formats available for video')
1077 if req_format is None:
1078 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1079 elif req_format == '-1':
1080 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1083 if req_format not in url_map:
1084 self._downloader.trouble(u'ERROR: requested format not available')
1086 video_url_list = [(req_format, url_map[req_format])] # Specific format
1088 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1089 self.report_rtmp_download()
1090 video_url_list = [(None, video_info['conn'][0])]
1093 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1096 for format_param, video_real_url in video_url_list:
1097 # At this point we have a new video
1098 self._downloader.increment_downloads()
1101 video_extension = self._video_extensions.get(format_param, 'flv')
1103 # Find the video URL in fmt_url_map or conn paramters
1105 # Process video information
1106 self._downloader.process_info({
1107 'id': video_id.decode('utf-8'),
1108 'url': video_real_url.decode('utf-8'),
1109 'uploader': video_uploader.decode('utf-8'),
1110 'upload_date': upload_date,
1111 'title': video_title,
1112 'stitle': simple_title,
1113 'ext': video_extension.decode('utf-8'),
1114 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1115 'thumbnail': video_thumbnail.decode('utf-8'),
1116 'description': video_description.decode('utf-8'),
1117 'player_url': player_url,
1119 except UnavailableVideoError, err:
1120 self._downloader.trouble(u'\nERROR: unable to download video')
1123 class MetacafeIE(InfoExtractor):
1124 """Information Extractor for metacafe.com."""
1126 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1127 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1128 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1131 def __init__(self, youtube_ie, downloader=None):
1132 InfoExtractor.__init__(self, downloader)
1133 self._youtube_ie = youtube_ie
1137 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1139 def report_disclaimer(self):
1140 """Report disclaimer retrieval."""
1141 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1143 def report_age_confirmation(self):
1144 """Report attempt to confirm age."""
1145 self._downloader.to_screen(u'[metacafe] Confirming age')
1147 def report_download_webpage(self, video_id):
1148 """Report webpage download."""
1149 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1151 def report_extraction(self, video_id):
1152 """Report information extraction."""
1153 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1155 def _real_initialize(self):
1156 # Retrieve disclaimer
1157 request = urllib2.Request(self._DISCLAIMER)
1159 self.report_disclaimer()
1160 disclaimer = urllib2.urlopen(request).read()
1161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1168 'submit': "Continue - I'm over 18",
1170 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1172 self.report_age_confirmation()
1173 disclaimer = urllib2.urlopen(request).read()
1174 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1178 def _real_extract(self, url):
1179 # Extract id and simplified title from URL
1180 mobj = re.match(self._VALID_URL, url)
1182 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1185 video_id = mobj.group(1)
1187 # Check if video comes from YouTube
1188 mobj2 = re.match(r'^yt-(.*)$', video_id)
1189 if mobj2 is not None:
1190 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1193 # At this point we have a new video
1194 self._downloader.increment_downloads()
1196 simple_title = mobj.group(2).decode('utf-8')
1198 # Retrieve video webpage to extract further information
1199 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1201 self.report_download_webpage(video_id)
1202 webpage = urllib2.urlopen(request).read()
1203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1204 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1207 # Extract URL, uploader and title from webpage
1208 self.report_extraction(video_id)
1209 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1210 if mobj is not None:
1211 mediaURL = urllib.unquote(mobj.group(1))
1212 video_extension = mediaURL[-3:]
1214 # Extract gdaKey if available
1215 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1217 video_url = mediaURL
1219 gdaKey = mobj.group(1)
1220 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1222 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1224 self._downloader.trouble(u'ERROR: unable to extract media URL')
1226 vardict = parse_qs(mobj.group(1))
1227 if 'mediaData' not in vardict:
1228 self._downloader.trouble(u'ERROR: unable to extract media URL')
1230 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1232 self._downloader.trouble(u'ERROR: unable to extract media URL')
1234 mediaURL = mobj.group(1).replace('\\/', '/')
1235 video_extension = mediaURL[-3:]
1236 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1238 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1240 self._downloader.trouble(u'ERROR: unable to extract title')
1242 video_title = mobj.group(1).decode('utf-8')
1243 video_title = sanitize_title(video_title)
1245 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1247 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1249 video_uploader = mobj.group(1)
1252 # Process video information
1253 self._downloader.process_info({
1254 'id': video_id.decode('utf-8'),
1255 'url': video_url.decode('utf-8'),
1256 'uploader': video_uploader.decode('utf-8'),
1257 'upload_date': u'NA',
1258 'title': video_title,
1259 'stitle': simple_title,
1260 'ext': video_extension.decode('utf-8'),
1264 except UnavailableVideoError:
1265 self._downloader.trouble(u'\nERROR: unable to download video')
1268 class DailymotionIE(InfoExtractor):
1269 """Information Extractor for Dailymotion"""
1271 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1273 def __init__(self, downloader=None):
1274 InfoExtractor.__init__(self, downloader)
1278 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1280 def report_download_webpage(self, video_id):
1281 """Report webpage download."""
1282 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1284 def report_extraction(self, video_id):
1285 """Report information extraction."""
1286 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1288 def _real_initialize(self):
1291 def _real_extract(self, url):
1292 # Extract id and simplified title from URL
1293 mobj = re.match(self._VALID_URL, url)
1295 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1298 # At this point we have a new video
1299 self._downloader.increment_downloads()
1300 video_id = mobj.group(1)
1302 simple_title = mobj.group(2).decode('utf-8')
1303 video_extension = 'flv'
1305 # Retrieve video webpage to extract further information
1306 request = urllib2.Request(url)
1308 self.report_download_webpage(video_id)
1309 webpage = urllib2.urlopen(request).read()
1310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1314 # Extract URL, uploader and title from webpage
1315 self.report_extraction(video_id)
1316 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract media URL')
1320 mediaURL = urllib.unquote(mobj.group(1))
1322 # if needed add http://www.dailymotion.com/ if relative URL
1324 video_url = mediaURL
1326 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1327 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1329 self._downloader.trouble(u'ERROR: unable to extract title')
1331 video_title = mobj.group(1).decode('utf-8')
1332 video_title = sanitize_title(video_title)
1334 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1336 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1338 video_uploader = mobj.group(1)
1341 # Process video information
1342 self._downloader.process_info({
1343 'id': video_id.decode('utf-8'),
1344 'url': video_url.decode('utf-8'),
1345 'uploader': video_uploader.decode('utf-8'),
1346 'upload_date': u'NA',
1347 'title': video_title,
1348 'stitle': simple_title,
1349 'ext': video_extension.decode('utf-8'),
1353 except UnavailableVideoError:
1354 self._downloader.trouble(u'\nERROR: unable to download video')
1356 class GoogleIE(InfoExtractor):
1357 """Information extractor for video.google.com."""
1359 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1361 def __init__(self, downloader=None):
1362 InfoExtractor.__init__(self, downloader)
1366 return (re.match(GoogleIE._VALID_URL, url) is not None)
1368 def report_download_webpage(self, video_id):
1369 """Report webpage download."""
1370 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1372 def report_extraction(self, video_id):
1373 """Report information extraction."""
1374 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1376 def _real_initialize(self):
1379 def _real_extract(self, url):
1380 # Extract id from URL
1381 mobj = re.match(self._VALID_URL, url)
1383 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1386 # At this point we have a new video
1387 self._downloader.increment_downloads()
1388 video_id = mobj.group(1)
1390 video_extension = 'mp4'
1392 # Retrieve video webpage to extract further information
1393 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1395 self.report_download_webpage(video_id)
1396 webpage = urllib2.urlopen(request).read()
1397 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1398 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1401 # Extract URL, uploader, and title from webpage
1402 self.report_extraction(video_id)
1403 mobj = re.search(r"download_url:'([^']+)'", webpage)
1405 video_extension = 'flv'
1406 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1408 self._downloader.trouble(u'ERROR: unable to extract media URL')
1410 mediaURL = urllib.unquote(mobj.group(1))
1411 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1412 mediaURL = mediaURL.replace('\\x26', '\x26')
1414 video_url = mediaURL
1416 mobj = re.search(r'<title>(.*)</title>', webpage)
1418 self._downloader.trouble(u'ERROR: unable to extract title')
1420 video_title = mobj.group(1).decode('utf-8')
1421 video_title = sanitize_title(video_title)
1422 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1424 # Extract video description
1425 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1427 self._downloader.trouble(u'ERROR: unable to extract video description')
1429 video_description = mobj.group(1).decode('utf-8')
1430 if not video_description:
1431 video_description = 'No description available.'
1433 # Extract video thumbnail
1434 if self._downloader.params.get('forcethumbnail', False):
1435 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1437 webpage = urllib2.urlopen(request).read()
1438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1439 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1441 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1443 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1445 video_thumbnail = mobj.group(1)
1446 else: # we need something to pass to process_info
1447 video_thumbnail = ''
1451 # Process video information
1452 self._downloader.process_info({
1453 'id': video_id.decode('utf-8'),
1454 'url': video_url.decode('utf-8'),
1456 'upload_date': u'NA',
1457 'title': video_title,
1458 'stitle': simple_title,
1459 'ext': video_extension.decode('utf-8'),
1463 except UnavailableVideoError:
1464 self._downloader.trouble(u'\nERROR: unable to download video')
1467 class PhotobucketIE(InfoExtractor):
1468 """Information extractor for photobucket.com."""
1470 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1472 def __init__(self, downloader=None):
1473 InfoExtractor.__init__(self, downloader)
1477 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1479 def report_download_webpage(self, video_id):
1480 """Report webpage download."""
1481 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1483 def report_extraction(self, video_id):
1484 """Report information extraction."""
1485 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1487 def _real_initialize(self):
1490 def _real_extract(self, url):
1491 # Extract id from URL
1492 mobj = re.match(self._VALID_URL, url)
1494 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1497 # At this point we have a new video
1498 self._downloader.increment_downloads()
1499 video_id = mobj.group(1)
1501 video_extension = 'flv'
1503 # Retrieve video webpage to extract further information
1504 request = urllib2.Request(url)
1506 self.report_download_webpage(video_id)
1507 webpage = urllib2.urlopen(request).read()
1508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1509 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1512 # Extract URL, uploader, and title from webpage
1513 self.report_extraction(video_id)
1514 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1516 self._downloader.trouble(u'ERROR: unable to extract media URL')
1518 mediaURL = urllib.unquote(mobj.group(1))
1520 video_url = mediaURL
1522 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1524 self._downloader.trouble(u'ERROR: unable to extract title')
1526 video_title = mobj.group(1).decode('utf-8')
1527 video_title = sanitize_title(video_title)
1528 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1530 video_uploader = mobj.group(2).decode('utf-8')
1533 # Process video information
1534 self._downloader.process_info({
1535 'id': video_id.decode('utf-8'),
1536 'url': video_url.decode('utf-8'),
1537 'uploader': video_uploader,
1538 'upload_date': u'NA',
1539 'title': video_title,
1540 'stitle': simple_title,
1541 'ext': video_extension.decode('utf-8'),
1545 except UnavailableVideoError:
1546 self._downloader.trouble(u'\nERROR: unable to download video')
1549 class YahooIE(InfoExtractor):
1550 """Information extractor for video.yahoo.com."""
1552 # _VALID_URL matches all Yahoo! Video URLs
1553 # _VPAGE_URL matches only the extractable '/watch/' URLs
1554 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1555 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1557 def __init__(self, downloader=None):
1558 InfoExtractor.__init__(self, downloader)
1562 return (re.match(YahooIE._VALID_URL, url) is not None)
1564 def report_download_webpage(self, video_id):
1565 """Report webpage download."""
1566 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1568 def report_extraction(self, video_id):
1569 """Report information extraction."""
1570 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1572 def _real_initialize(self):
1575 def _real_extract(self, url, new_video=True):
1576 # Extract ID from URL
1577 mobj = re.match(self._VALID_URL, url)
1579 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1582 # At this point we have a new video
1583 self._downloader.increment_downloads()
1584 video_id = mobj.group(2)
1585 video_extension = 'flv'
1587 # Rewrite valid but non-extractable URLs as
1588 # extractable English language /watch/ URLs
1589 if re.match(self._VPAGE_URL, url) is None:
1590 request = urllib2.Request(url)
1592 webpage = urllib2.urlopen(request).read()
1593 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1597 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1599 self._downloader.trouble(u'ERROR: Unable to extract id field')
1601 yahoo_id = mobj.group(1)
1603 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1605 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1607 yahoo_vid = mobj.group(1)
1609 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1610 return self._real_extract(url, new_video=False)
1612 # Retrieve video webpage to extract further information
1613 request = urllib2.Request(url)
1615 self.report_download_webpage(video_id)
1616 webpage = urllib2.urlopen(request).read()
1617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1618 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1621 # Extract uploader and title from webpage
1622 self.report_extraction(video_id)
1623 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1625 self._downloader.trouble(u'ERROR: unable to extract video title')
1627 video_title = mobj.group(1).decode('utf-8')
1628 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1630 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1632 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1634 video_uploader = mobj.group(1).decode('utf-8')
1636 # Extract video thumbnail
1637 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1639 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1641 video_thumbnail = mobj.group(1).decode('utf-8')
1643 # Extract video description
1644 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1646 self._downloader.trouble(u'ERROR: unable to extract video description')
1648 video_description = mobj.group(1).decode('utf-8')
1649 if not video_description: video_description = 'No description available.'
1651 # Extract video height and width
1652 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1654 self._downloader.trouble(u'ERROR: unable to extract video height')
1656 yv_video_height = mobj.group(1)
1658 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract video width')
1662 yv_video_width = mobj.group(1)
1664 # Retrieve video playlist to extract media URL
1665 # I'm not completely sure what all these options are, but we
1666 # seem to need most of them, otherwise the server sends a 401.
1667 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1668 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1669 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1670 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1671 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1673 self.report_download_webpage(video_id)
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1679 # Extract media URL from playlist XML
1680 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1682 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1684 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1685 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1688 # Process video information
1689 self._downloader.process_info({
1690 'id': video_id.decode('utf-8'),
1692 'uploader': video_uploader,
1693 'upload_date': u'NA',
1694 'title': video_title,
1695 'stitle': simple_title,
1696 'ext': video_extension.decode('utf-8'),
1697 'thumbnail': video_thumbnail.decode('utf-8'),
1698 'description': video_description,
1699 'thumbnail': video_thumbnail,
1700 'description': video_description,
1703 except UnavailableVideoError:
1704 self._downloader.trouble(u'\nERROR: unable to download video')
1707 class GenericIE(InfoExtractor):
1708 """Generic last-resort information extractor."""
1710 def __init__(self, downloader=None):
1711 InfoExtractor.__init__(self, downloader)
1717 def report_download_webpage(self, video_id):
1718 """Report webpage download."""
1719 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1720 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1722 def report_extraction(self, video_id):
1723 """Report information extraction."""
1724 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1726 def _real_initialize(self):
1729 def _real_extract(self, url):
1730 # At this point we have a new video
1731 self._downloader.increment_downloads()
1733 video_id = url.split('/')[-1]
1734 request = urllib2.Request(url)
1736 self.report_download_webpage(video_id)
1737 webpage = urllib2.urlopen(request).read()
1738 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1739 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1741 except ValueError, err:
1742 # since this is the last-resort InfoExtractor, if
1743 # this error is thrown, it'll be thrown here
1744 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1747 self.report_extraction(video_id)
1748 # Start with something easy: JW Player in SWFObject
1749 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1751 # Broaden the search a little bit
1752 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1754 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1757 # It's possible that one of the regexes
1758 # matched, but returned an empty group:
1759 if mobj.group(1) is None:
1760 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1763 video_url = urllib.unquote(mobj.group(1))
1764 video_id = os.path.basename(video_url)
1766 # here's a fun little line of code for you:
1767 video_extension = os.path.splitext(video_id)[1][1:]
1768 video_id = os.path.splitext(video_id)[0]
1770 # it's tempting to parse this further, but you would
1771 # have to take into account all the variations like
1772 # Video Title - Site Name
1773 # Site Name | Video Title
1774 # Video Title - Tagline | Site Name
1775 # and so on and so forth; it's just not practical
1776 mobj = re.search(r'<title>(.*)</title>', webpage)
1778 self._downloader.trouble(u'ERROR: unable to extract title')
1780 video_title = mobj.group(1).decode('utf-8')
1781 video_title = sanitize_title(video_title)
1782 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1784 # video uploader is domain name
1785 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1787 self._downloader.trouble(u'ERROR: unable to extract title')
1789 video_uploader = mobj.group(1).decode('utf-8')
1792 # Process video information
1793 self._downloader.process_info({
1794 'id': video_id.decode('utf-8'),
1795 'url': video_url.decode('utf-8'),
1796 'uploader': video_uploader,
1797 'upload_date': u'NA',
1798 'title': video_title,
1799 'stitle': simple_title,
1800 'ext': video_extension.decode('utf-8'),
1804 except UnavailableVideoError, err:
1805 self._downloader.trouble(u'\nERROR: unable to download video')
1808 class YoutubeSearchIE(InfoExtractor):
1809 """Information Extractor for YouTube search queries."""
1810 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1811 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1812 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1813 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1815 _max_youtube_results = 1000
1817 def __init__(self, youtube_ie, downloader=None):
1818 InfoExtractor.__init__(self, downloader)
1819 self._youtube_ie = youtube_ie
1823 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1825 def report_download_page(self, query, pagenum):
1826 """Report attempt to download playlist page with given number."""
1827 query = query.decode(preferredencoding())
1828 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1830 def _real_initialize(self):
1831 self._youtube_ie.initialize()
1833 def _real_extract(self, query):
1834 mobj = re.match(self._VALID_QUERY, query)
1836 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1839 prefix, query = query.split(':')
1841 query = query.encode('utf-8')
1843 self._download_n_results(query, 1)
1845 elif prefix == 'all':
1846 self._download_n_results(query, self._max_youtube_results)
1852 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1854 elif n > self._max_youtube_results:
1855 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1856 n = self._max_youtube_results
1857 self._download_n_results(query, n)
1859 except ValueError: # parsing prefix as integer fails
1860 self._download_n_results(query, 1)
1863 def _download_n_results(self, query, n):
1864 """Downloads a specified number of results for a query"""
1867 already_seen = set()
1871 self.report_download_page(query, pagenum)
1872 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1873 request = urllib2.Request(result_url)
1875 page = urllib2.urlopen(request).read()
1876 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1877 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1880 # Extract video identifiers
1881 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1882 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1883 if video_id not in already_seen:
1884 video_ids.append(video_id)
1885 already_seen.add(video_id)
1886 if len(video_ids) == n:
1887 # Specified n videos reached
1888 for id in video_ids:
1889 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1892 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1893 for id in video_ids:
1894 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1897 pagenum = pagenum + 1
1899 class GoogleSearchIE(InfoExtractor):
1900 """Information Extractor for Google Video search queries."""
1901 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1902 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1903 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1904 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1906 _max_google_results = 1000
1908 def __init__(self, google_ie, downloader=None):
1909 InfoExtractor.__init__(self, downloader)
1910 self._google_ie = google_ie
1914 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1916 def report_download_page(self, query, pagenum):
1917 """Report attempt to download playlist page with given number."""
1918 query = query.decode(preferredencoding())
1919 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1921 def _real_initialize(self):
1922 self._google_ie.initialize()
1924 def _real_extract(self, query):
1925 mobj = re.match(self._VALID_QUERY, query)
1927 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1930 prefix, query = query.split(':')
1932 query = query.encode('utf-8')
1934 self._download_n_results(query, 1)
1936 elif prefix == 'all':
1937 self._download_n_results(query, self._max_google_results)
1943 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1945 elif n > self._max_google_results:
1946 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1947 n = self._max_google_results
1948 self._download_n_results(query, n)
1950 except ValueError: # parsing prefix as integer fails
1951 self._download_n_results(query, 1)
1954 def _download_n_results(self, query, n):
1955 """Downloads a specified number of results for a query"""
1958 already_seen = set()
1962 self.report_download_page(query, pagenum)
1963 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1964 request = urllib2.Request(result_url)
1966 page = urllib2.urlopen(request).read()
1967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1971 # Extract video identifiers
1972 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1973 video_id = mobj.group(1)
1974 if video_id not in already_seen:
1975 video_ids.append(video_id)
1976 already_seen.add(video_id)
1977 if len(video_ids) == n:
1978 # Specified n videos reached
1979 for id in video_ids:
1980 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1983 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1984 for id in video_ids:
1985 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1988 pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries."""
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_yahoo_ie = None
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# YahooIE used to extract every watch-page id the search turns up.
		self._yahoo_ie = yahoo_ie

	@staticmethod
	def suitable(url):
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2007 def report_download_page(self, query, pagenum):
2008 """Report attempt to download playlist page with given number."""
2009 query = query.decode(preferredencoding())
2010 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2012 def _real_initialize(self):
2013 self._yahoo_ie.initialize()
2015 def _real_extract(self, query):
2016 mobj = re.match(self._VALID_QUERY, query)
2018 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2021 prefix, query = query.split(':')
2023 query = query.encode('utf-8')
2025 self._download_n_results(query, 1)
2027 elif prefix == 'all':
2028 self._download_n_results(query, self._max_yahoo_results)
2034 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2036 elif n > self._max_yahoo_results:
2037 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2038 n = self._max_yahoo_results
2039 self._download_n_results(query, n)
2041 except ValueError: # parsing prefix as integer fails
2042 self._download_n_results(query, 1)
2045 def _download_n_results(self, query, n):
2046 """Downloads a specified number of results for a query"""
2049 already_seen = set()
2053 self.report_download_page(query, pagenum)
2054 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2055 request = urllib2.Request(result_url)
2057 page = urllib2.urlopen(request).read()
2058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2062 # Extract video identifiers
2063 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2064 video_id = mobj.group(1)
2065 if video_id not in already_seen:
2066 video_ids.append(video_id)
2067 already_seen.add(video_id)
2068 if len(video_ids) == n:
2069 # Specified n videos reached
2070 for id in video_ids:
2071 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2074 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2075 for id in video_ids:
2076 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2079 pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks every page of a playlist (or a user's "my_playlists"/"/p/"
	URL), collects the video ids in order, applies the
	playliststart/playlistend window, and feeds each id to a wrapped
	YoutubeIE.

	NOTE(review): this listing is elided — loop headers, `try:`,
	`break` and the `pagenum`/`video_ids`/`ids_in_page`
	initialisations are missing from view.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
	# %s slots: playlist id, page number; gl/hl pinned for stable markup.
	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Performs the real per-video extraction.
		self._youtube_ie = youtube_ie

		# NOTE(review): the enclosing `suitable(url)` method definition
		# is elided in this view; this is its body.
		return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		# (elided: `if mobj is None:` header and `return`)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Download playlist pages
		playlist_id = mobj.group(1)
		# (elided: page loop header)
			self.report_download_page(playlist_id, pagenum)
			request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
			page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# De-duplicate within the page while preserving order.
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop paging once there is no "Next" link.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			pagenum = pagenum + 1

		# Apply the user-requested playlist window (1-based start).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		# NOTE(review): with the default playlistend == -1 this slice
		# excludes the final entry (Python slice ends are exclusive) —
		# confirm whether dropping the last video is intended.
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Fetches a user's GData feed, collects the video ids found in it,
	applies the playliststart/playlistend window, and feeds each id
	to a wrapped YoutubeIE.

	NOTE(review): this listing is elided — error-branch headers and
	the `video_ids`/`ids_in_page` initialisations are missing from
	view.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Performs the real per-video extraction.
		self._youtube_ie = youtube_ie

		# NOTE(review): the enclosing `suitable(url)` method definition
		# is elided in this view; this is its body.
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		# (elided: `if mobj is None:` header and `return`)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Download user page
		username = mobj.group(1)
		self.report_download_page(username)
		request = urllib2.Request(self._TEMPLATE_URL % (username))
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# De-duplicate while preserving feed order.
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))
		video_ids.extend(ids_in_page)

		# Apply the user-requested window (1-based start).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		# NOTE(review): with the default playlistend == -1 this slice
		# excludes the final entry — confirm intended (same pattern as
		# the playlist extractor).
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com.

	Rebuilds the URL in the English locale, simulates pressing the
	'Free download' button via a POST, scrapes the real file URL and
	title out of the returned page, and hands the result to
	process_info().

	NOTE(review): this listing is elided — several `try:`/`if`/`else`
	headers and `return` statements are missing from view.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# NOTE(review): the enclosing `suitable(url)` method definition
		# is elided in this view; this is its body.
		return (re.match(DepositFilesIE._VALID_URL, url) is not None)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):
		# (body elided in this view)

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		self.report_download_webpage(file_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Surface the site's own restriction text, whitespace-collapsed.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			# (elided: `else:` header for the generic-failure branch)
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		# Extension without the leading dot.
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		# (elided: `if mobj is None:` header and `return`)
			self._downloader.trouble(u'ERROR: unable to extract title')
		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': file_title,
			'stitle': file_title,
			'ext': file_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
### MAIN PROGRAM ###
# NOTE(review): this listing is elided — several `try:`/`else:` headers,
# `sys.exit(...)` calls, imports and other single lines are missing from
# the visible source; comments below mark the gaps where relevant.
if __name__ == '__main__':
	# Modules needed only when running the main program
	# (the corresponding import statements are elided in this view)

	# Function to update the program file with the latest version from the repository.
	def update_self(downloader, filename):
		# Note: downloader only used for options
		if not os.access(filename, os.W_OK):
			sys.exit('ERROR: no write permissions on %s' % filename)

		downloader.to_screen('Updating to latest stable version...')
		# (elided: `try:`) — resolve latest version tag, then fetch that build
		latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
		latest_version = urllib.urlopen(latest_url).read().strip()
		prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
		newcontent = urllib.urlopen(prog_url).read()
		except (IOError, OSError), err:
			sys.exit('ERROR: unable to download latest version')
		# (elided: `try:`) — overwrite this script in place
		stream = open(filename, 'w')
		stream.write(newcontent)
		except (IOError, OSError), err:
			sys.exit('ERROR: unable to overwrite current version')
		downloader.to_screen('Updated to version %s' % latest_version)

	# Parse command line
	parser = optparse.OptionParser(
		usage='Usage: %prog [options] url...',
		version='2010.12.09',
		conflict_handler='resolve',

	# General options (conflict_handler='resolve' lets -h/-v be redefined)
	parser.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	parser.add_option('-v', '--version',
			action='version', help='print program version and exit')
	parser.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
	parser.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	parser.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	parser.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	parser.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	parser.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	parser.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)

	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
	parser.add_option_group(authentication)

	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	parser.add_option_group(video_format)

	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
	parser.add_option_group(verbosity)

	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	parser.add_option_group(filesystem)

	(opts, args) = parser.parse_args()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		# No --cookies: use an in-memory jar (never persisted).
		jar = cookielib.CookieJar()
	# (elided: `else:` and `try:` headers)
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		# Pre-load existing cookies only if the file is readable.
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent and exit if requested
	if opts.dump_user_agent:
		print std_headers['User-Agent']

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	# Batch file verification
	if opts.batchfile is not None:
		# (elided: `try:` header and the `-`/stdin branch)
		if opts.batchfile == '-':
			batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blank lines and #, /, ; comment lines.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively rather than requiring -p on the command line.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		# (elided: `try:` header)
		opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	# (elided: `try:` header)
	opts.playliststart = long(opts.playliststart)
	if opts.playliststart <= 0:
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	# (elided: `try:` header)
	opts.playlistend = long(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')

	# Information extractors
	youtube_ie = YoutubeIE()
	metacafe_ie = MetacafeIE(youtube_ie)
	dailymotion_ie = DailymotionIE()
	youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
	youtube_user_ie = YoutubeUserIE(youtube_ie)
	youtube_search_ie = YoutubeSearchIE(youtube_ie)
	google_ie = GoogleIE()
	google_search_ie = GoogleSearchIE(google_ie)
	photobucket_ie = PhotobucketIE()
	yahoo_ie = YahooIE()
	yahoo_search_ie = YahooSearchIE(yahoo_ie)
	deposit_files_ie = DepositFilesIE()
	generic_ie = GenericIE()

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any "print X and exit" mode implies quiet + simulate.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
		'format': opts.format,
		'format_limit': opts.format_limit,
		# First matching template wins: explicit -o, then the
		# --all-formats variants, then title/literal/autonumber
		# combinations, finally the bare id.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Output to "-" means the file goes to stdout, so logs go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
	fd.add_info_extractor(youtube_search_ie)
	fd.add_info_extractor(youtube_pl_ie)
	fd.add_info_extractor(youtube_user_ie)
	fd.add_info_extractor(metacafe_ie)
	fd.add_info_extractor(dailymotion_ie)
	fd.add_info_extractor(youtube_ie)
	fd.add_info_extractor(google_ie)
	fd.add_info_extractor(google_search_ie)
	fd.add_info_extractor(photobucket_ie)
	fd.add_info_extractor(yahoo_ie)
	fd.add_info_extractor(yahoo_search_ie)
	fd.add_info_extractor(deposit_files_ie)

	# This must come last since it's the
	# fallback if none of the others work
	fd.add_info_extractor(generic_ie)

	# Update version if requested
	if opts.update_self:
		update_self(fd, sys.argv[0])

	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')

	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		# (elided: `try:` header and the jar.save() call)
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	# (elided: the enclosing `try:` these handlers belong to)
	except DownloadError:
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')