2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # License: Public domain code
34 # parse_qs was moved from the cgi module to the urlparse module recently.
36 from urlparse import parse_qs
38 from cgi import parse_qs
41 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10',
42 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
43 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 'Accept-Encoding': 'gzip, deflate',
45 'Accept-Language': 'en-us,en;q=0.5',
# Characters kept in "simplified" titles: ASCII letters and digits only,
# decoded to unicode (Python 2 str -> unicode via .decode('ascii')).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Inner generator so a failing locale lookup can be handled once and the
    # chosen encoding reused; only the first yielded value is consumed.
    def yield_preferredencoding():
        # NOTE(review): the surrounding try/except and the yield statements
        # are not visible in this view — presumably 'pref' is yielded
        # repeatedly, with a fallback on locale errors. Confirm against the
        # full file.
        pref = locale.getpreferredencoding()
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#xA0)
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the 'if mobj is not None:' guard and the 'base' binding
    # (16 for the x-prefixed form, 10 otherwise) are not visible in this view.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with '0' so long() accepts the '0x...' form in base 16.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
92 def sanitize_title(utitle):
93 """Sanitizes a video title so it could be used as part of a filename."""
94 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
95 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the enclosing try: and the u'-' (write-to-stdout) guard
    # are not visible in this view; indentation below is reconstructed.
    if sys.platform == 'win32':
        # Switch stdout to binary mode on Windows before streaming video data.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz understands the RFC 2822 date format including the
    # timezone offset; it returns None when the string cannot be parsed.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # NOTE(review): the initialisation and final 'return timestamp' are not
    # visible in this view — presumably None is returned on parse failure.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts; callers format them into the error
        # message (see FileDownloader.process_info).
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): the 'deflate(data)' method header and its try/except are
    # not visible in this view. Raw deflate (negative wbits) is tried first;
    # the zlib-wrapped form is the fallback.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: newer Pythons accept 'code' in the constructor;
        # older ones need it attached afterwards (attachment lines not
        # visible in this view).
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Stamp every outgoing request with the standard browser-like headers.
        for h in std_headers:
            # NOTE(review): lines removing a pre-existing header of the same
            # name are not visible in this view.
            req.add_header(h, std_headers[h])
        # Internal marker header: drop Accept-encoding so the server sends an
        # uncompressed body, then strip the marker before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # NOTE(review): the binding of 'old_resp' (the original response) is
        # not visible in this view.
        # gzip-encoded body: wrap the payload in a GzipFile and re-wrap the
        # response object so callers see a plain stream.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-encoded body: decompress and re-wrap likewise.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    """

    # Process exit code returned by download(); set to 1 by trouble() when
    # errors are being ignored.
    _download_retcode = None
    # Ordinal backing the %(autonumber)s output-template field.
    _num_downloads = None
    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): initialisation of the InfoExtractor/PostProcessor
        # lists and the 'self.params = params' assignment are not visible in
        # this view.
        self._download_retcode = 0
        self._num_downloads = 0
        # Progress output goes to stderr when 'logtostderr' is set, else stdout.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build ancestor paths, shortest first; skip the last component (the
        # file itself), and terminate each with the separator.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the os.mkdir(dir) call is not visible in this view.
    def format_bytes(bytes):
        # Render a byte count as e.g. '1.23M' using 1024-based suffixes
        # b/k/M/G/T/P/E/Z/Y.
        # NOTE(review): the None/zero special cases and the str-to-float
        # conversion under this 'if' are not visible in this view.
        if type(bytes) is str:
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        # Right-aligned percentage string, e.g. ' 42.1%'.
        # NOTE(review): the guard returning a placeholder when data_len is
        # None is not visible in this view.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    def calc_eta(start, now, total, current):
        # Estimate remaining time as 'MM:SS' from bytes done vs. total.
        # NOTE(review): the binding of 'dif' (now - start), the placeholder
        # returns, and the overflow guard are not visible in this view.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)
    def calc_speed(start, now, bytes):
        # Right-aligned human-readable rate, e.g. '  1.23Mb/s'.
        # NOTE(review): the binding of 'dif' (now - start) is not visible in
        # this view.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size to recent throughput, clamped between
        # half and double the previous block.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time
        # NOTE(review): the return statements choosing between new_max,
        # new_min and the measured rate are not visible in this view.
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the matchobj-is-None guard is not visible in this view.
        number = float(matchobj.group(1))
        # An empty suffix group indexes 'b' (position 0 -> 1024**0); other
        # suffixes scale by the matching power of 1024.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the append to the internal IE list is not visible in
        # this view. Mutual registration: the IE receives a back-reference.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the append to the internal PP chain is not visible in
        # this view. Mutual registration: the PP receives a back-reference.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the enclosing try: is not visible in this view.
        if not self.params.get('quiet', False):
            # Trailing comma suppresses print's newline; the terminator
            # controls it explicitly instead.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-raise for non-ignored encoding errors
                # is not visible in this view.
402 def to_stderr(self, message):
403 """Print message to stderr."""
404 print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # NOTE(review): the early 'return' is not visible in this view.
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style OSC escape sequence to set the terminal title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
417 def fixed_template(self):
418 """Checks if the output template is fixed."""
419 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
421 def trouble(self, message=None):
422 """Determine action to take when a download problem appears.
424 Depending on if the downloader has been configured to ignore
425 download errors or not, this method may throw an exception or
426 not when errors are found, after printing the message.
428 if message is not None:
429 self.to_stderr(message)
430 if not self.params.get('ignoreerrors', False):
431 raise DownloadError(message)
432 self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): the early 'return' and the binding of 'now'
            # (time.time()) are not visible in this view.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # Download in place when .part files are disabled, when streaming to
        # stdout, or when the target exists but is not a regular file.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            # NOTE(review): the 'return filename' (in-place download) is not
            # visible in this view.
        return filename + u'.part'
    def undo_temp_name(self, filename):
        # Strip the '.part' suffix added by temp_name().
        # NOTE(review): the fallthrough 'return filename' for names without
        # the suffix is not visible in this view.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
    def try_rename(self, old_filename, new_filename):
        # Move the finished .part file into its final place.
        # NOTE(review): the enclosing try: and the no-op early return are not
        # visible in this view.
        if old_filename == new_filename:
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        # NOTE(review): the early returns and try/except guards of this
        # best-effort method are not visible in this view.
        if last_modified_hdr is None:
        if not os.path.isfile(filename):
        timestr = last_modified_hdr
        filetime = timeconvert(timestr)
        # Keep atime as "now"; set mtime to the server-reported time.
        os.utime(filename,(time.time(), filetime))
484 def report_destination(self, filename):
485 """Report destination filename."""
486 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early 'return' is not visible in this view.
        # Leading \r rewrites the same console line on every update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
497 def report_resuming_byte(self, resume_len):
498 """Report attempt to resume at given byte."""
499 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
501 def report_retry(self, count, retries):
502 """Report retry in case of HTTP error 5xx"""
503 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the enclosing try: is not visible in this view.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename is not representable in the console encoding; fall
            # back to a generic message.
            self.to_screen(u'[download] The file has already been downloaded')
512 def report_unable_to_resume(self):
513 """Report it was impossible to resume download."""
514 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # NOTE(review): the else branch (finishing the progress line) is not
        # visible in this view.
523 def increment_downloads(self):
524 """Increment the ordinal that assigns a number to each file."""
525 self._num_downloads += 1
    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        # NOTE(review): the enclosing try: and the 'return filename' /
        # 'return None' statements are not visible in this view.
        # Work on a copy so extra template fields don't leak back to the IE.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings — the only output produced in simulate mode.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(review): the 'return' closing simulate mode is not visible.

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(review): the early 'return' is not visible in this view.

        # NOTE(review): the try: around directory creation is not visible here.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        # NOTE(review): the try: around the download, and the branch that
        # checks 'success' before post-processing, are not visible here.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Treat local I/O failures as "this format is unavailable".
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # Run the post-processing chain on the downloaded file.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            # A fixed template would make every URL overwrite the same file.
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and over the registered
        # InfoExtractors are not visible in this view; indentation of the
        # fragments below is approximate.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
        # Suitable InfoExtractor found
        suitable_found = True
        # Extract information from URL and process it
        # Suitable InfoExtractor had been found; go to next URL
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into 'info' and the loop over the
        # registered PostProcessors are not visible in this view.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        # Hand rtmp:// URLs to the external rtmpdump binary.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the enclosing try: is not visible in this view.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); '-k 1' is added after a code-1 exit.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): the statement treating a stalled code-1 resume
                # as completion is not visible in this view.
        # NOTE(review): the 'retval == 0' success check and the failure
        # return are not visible; fragments below are the two outcomes.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            # NOTE(review): the 'return True' is not visible in this view.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request deliberately has no Range header — used to probe the
        # full content length when a resume attempt fails with 416.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(review): the 'else: resume_len = 0' branch is not visible here.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # Retry loop over transient server-side (5xx) errors.
        # NOTE(review): the initialisation of 'count' is not visible here.
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # NOTE(review): the try:, the success 'break', the re-raise for
            # unexpected errors, and the count increment are not visible in
            # this view; indentation below is approximate.
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
            self.report_retry(count, retries)
        # NOTE(review): the 'retries exhausted' guard is not visible here.
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Server reports the remaining length; add what we already have.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # Main read/write loop. NOTE(review): the loop header, the initial
        # block_size/open_mode/start values and the before/after timestamps
        # are not visible in this view.
        data_block = data.read(block_size)
        if len(data_block) == 0:
        byte_counter += len(data_block)

        # Open file just in time
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress reporting
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

        # Sanity check: did we receive as many bytes as announced?
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))
        # NOTE(review): the final 'return True' is not visible in this view.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the initialisation of the one-shot '_ready' flag is
        # not visible in this view.
        self.set_downloader(downloader)

    # NOTE(review): the 'def suitable(url)' header (a static method per the
    # class docstring) is not visible in this view; only its docstring is.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the 'already initialized' guard and the flag update
        # are not visible here — _real_initialize() runs only once.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the call to self.initialize() is not visible here.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Accepts youtu.be short links and youtube.com /v/, /embed/ and watch
    # URLs; the video id is captured in group 2, group 1 distinguishes a
    # full URL from a bare id (conditional pattern (?(1)...)).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the English UI so scraped pages have a predictable layout.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in ~/.netrc when 'usenetrc' is enabled.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
868 _video_extensions = {
874 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
881 return (re.match(YoutubeIE._VALID_URL, url) is not None)
883 def report_lang(self):
884 """Report attempt to set language."""
885 self._downloader.to_screen(u'[youtube] Setting language')
887 def report_login(self):
888 """Report attempt to log in."""
889 self._downloader.to_screen(u'[youtube] Logging in')
891 def report_age_confirmation(self):
892 """Report attempt to confirm age."""
893 self._downloader.to_screen(u'[youtube] Confirming age')
895 def report_video_webpage_download(self, video_id):
896 """Report attempt to download video webpage."""
897 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
899 def report_video_info_webpage_download(self, video_id):
900 """Report attempt to download video info webpage."""
901 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
903 def report_information_extraction(self, video_id):
904 """Report attempt to extract video information."""
905 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
907 def report_unavailable_format(self, video_id, format):
908 """Report extracted video URL."""
909 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
911 def report_rtmp_download(self):
912 """Indicate the download will use the RTMP protocol."""
913 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        # One-time setup: set the UI language, optionally log in, and confirm
        # the age gate, so later page fetches succeed.
        if self._downloader is None:
            # NOTE(review): the early 'return' and the username/password
            # defaults are not visible in this view.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing try: and the unpacking of 'info'
            # into username/password when found are not visible in this view.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue unauthenticated.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best effort — a warning, not an error, on failure).
        request = urllib2.Request(self._LANG_URL)
        # NOTE(review): the try: and the self.report_lang() call are not
        # visible in this view.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # NOTE(review): the 'username is None' early return is not visible.

        # Log in. NOTE(review): the 'login_form = {' opener, its remaining
        # fields and closing brace are not visible in this view.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # NOTE(review): the try: and self.report_login() are not visible here.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            # Getting the login form back means the credentials were rejected.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age. NOTE(review): the 'age_form = {' opener and its other
        # fields are not visible in this view. Failure here is fatal
        # (trouble), unlike the warnings above.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
984 def _real_extract(self, url):
985 # Extract video id from URL
986 mobj = re.match(self._VALID_URL, url)
988 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
990 video_id = mobj.group(2)
993 self.report_video_webpage_download(video_id)
994 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
996 video_webpage = urllib2.urlopen(request).read()
997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1001 # Attempt to extract SWF player URL
1002 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1003 if mobj is not None:
1004 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1009 self.report_video_info_webpage_download(video_id)
1010 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1011 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1012 % (video_id, el_type))
1013 request = urllib2.Request(video_info_url)
1015 video_info_webpage = urllib2.urlopen(request).read()
1016 video_info = parse_qs(video_info_webpage)
1017 if 'token' in video_info:
1019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1022 if 'token' not in video_info:
1023 if 'reason' in video_info:
1024 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1026 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1029 # Start extracting information
1030 self.report_information_extraction(video_id)
1033 if 'author' not in video_info:
1034 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1036 video_uploader = urllib.unquote_plus(video_info['author'][0])
1039 if 'title' not in video_info:
1040 self._downloader.trouble(u'ERROR: unable to extract video title')
1042 video_title = urllib.unquote_plus(video_info['title'][0])
1043 video_title = video_title.decode('utf-8')
1044 video_title = sanitize_title(video_title)
1047 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1048 simple_title = simple_title.strip(ur'_')
1051 if 'thumbnail_url' not in video_info:
1052 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1053 video_thumbnail = ''
1054 else: # don't panic if we can't find it
1055 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1059 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1060 if mobj is not None:
1061 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1062 format_expressions = ['%d %B %Y', '%B %d %Y']
1063 for expression in format_expressions:
1065 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1070 video_description = 'No description available.'
1071 if self._downloader.params.get('forcedescription', False):
1072 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1073 if mobj is not None:
1074 video_description = mobj.group(1)
1077 video_token = urllib.unquote_plus(video_info['token'][0])
1079 # Decide which formats to download
1080 req_format = self._downloader.params.get('format', None)
1082 if 'fmt_url_map' in video_info:
1083 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1084 format_limit = self._downloader.params.get('format_limit', None)
1085 if format_limit is not None and format_limit in self._available_formats:
1086 format_list = self._available_formats[self._available_formats.index(format_limit):]
1088 format_list = self._available_formats
1089 existing_formats = [x for x in format_list if x in url_map]
1090 if len(existing_formats) == 0:
1091 self._downloader.trouble(u'ERROR: no known formats available for video')
1093 if req_format is None:
1094 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1095 elif req_format == '-1':
1096 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1099 if req_format not in url_map:
1100 self._downloader.trouble(u'ERROR: requested format not available')
1102 video_url_list = [(req_format, url_map[req_format])] # Specific format
1104 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1105 self.report_rtmp_download()
1106 video_url_list = [(None, video_info['conn'][0])]
1109 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1112 for format_param, video_real_url in video_url_list:
1113 # At this point we have a new video
1114 self._downloader.increment_downloads()
1117 video_extension = self._video_extensions.get(format_param, 'flv')
1119 # Find the video URL in fmt_url_map or conn paramters
1121 # Process video information
1122 self._downloader.process_info({
1123 'id': video_id.decode('utf-8'),
1124 'url': video_real_url.decode('utf-8'),
1125 'uploader': video_uploader.decode('utf-8'),
1126 'upload_date': upload_date,
1127 'title': video_title,
1128 'stitle': simple_title,
1129 'ext': video_extension.decode('utf-8'),
1130 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1131 'thumbnail': video_thumbnail.decode('utf-8'),
1132 'description': video_description.decode('utf-8'),
1133 'player_url': player_url,
1135 except UnavailableVideoError, err:
1136 self._downloader.trouble(u'\nERROR: unable to download video')
1139 class MetacafeIE(InfoExtractor):
1140 """Information Extractor for metacafe.com."""
1142 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1143 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1144 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1147 def __init__(self, youtube_ie, downloader=None):
1148 InfoExtractor.__init__(self, downloader)
1149 self._youtube_ie = youtube_ie
1153 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1155 def report_disclaimer(self):
1156 """Report disclaimer retrieval."""
1157 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1159 def report_age_confirmation(self):
1160 """Report attempt to confirm age."""
1161 self._downloader.to_screen(u'[metacafe] Confirming age')
1163 def report_download_webpage(self, video_id):
1164 """Report webpage download."""
1165 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1167 def report_extraction(self, video_id):
1168 """Report information extraction."""
1169 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1171 def _real_initialize(self):
1172 # Retrieve disclaimer
1173 request = urllib2.Request(self._DISCLAIMER)
1175 self.report_disclaimer()
1176 disclaimer = urllib2.urlopen(request).read()
1177 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1178 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1184 'submit': "Continue - I'm over 18",
1186 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1188 self.report_age_confirmation()
1189 disclaimer = urllib2.urlopen(request).read()
1190 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1191 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1194 def _real_extract(self, url):
1195 # Extract id and simplified title from URL
1196 mobj = re.match(self._VALID_URL, url)
1198 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1201 video_id = mobj.group(1)
1203 # Check if video comes from YouTube
1204 mobj2 = re.match(r'^yt-(.*)$', video_id)
1205 if mobj2 is not None:
1206 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1209 # At this point we have a new video
1210 self._downloader.increment_downloads()
1212 simple_title = mobj.group(2).decode('utf-8')
1214 # Retrieve video webpage to extract further information
1215 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1217 self.report_download_webpage(video_id)
1218 webpage = urllib2.urlopen(request).read()
1219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1223 # Extract URL, uploader and title from webpage
1224 self.report_extraction(video_id)
1225 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1226 if mobj is not None:
1227 mediaURL = urllib.unquote(mobj.group(1))
1228 video_extension = mediaURL[-3:]
1230 # Extract gdaKey if available
1231 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1233 video_url = mediaURL
1235 gdaKey = mobj.group(1)
1236 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1238 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1240 self._downloader.trouble(u'ERROR: unable to extract media URL')
1242 vardict = parse_qs(mobj.group(1))
1243 if 'mediaData' not in vardict:
1244 self._downloader.trouble(u'ERROR: unable to extract media URL')
1246 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1248 self._downloader.trouble(u'ERROR: unable to extract media URL')
1250 mediaURL = mobj.group(1).replace('\\/', '/')
1251 video_extension = mediaURL[-3:]
1252 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1254 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1256 self._downloader.trouble(u'ERROR: unable to extract title')
1258 video_title = mobj.group(1).decode('utf-8')
1259 video_title = sanitize_title(video_title)
1261 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1263 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1265 video_uploader = mobj.group(1)
1268 # Process video information
1269 self._downloader.process_info({
1270 'id': video_id.decode('utf-8'),
1271 'url': video_url.decode('utf-8'),
1272 'uploader': video_uploader.decode('utf-8'),
1273 'upload_date': u'NA',
1274 'title': video_title,
1275 'stitle': simple_title,
1276 'ext': video_extension.decode('utf-8'),
1280 except UnavailableVideoError:
1281 self._downloader.trouble(u'\nERROR: unable to download video')
1284 class DailymotionIE(InfoExtractor):
1285 """Information Extractor for Dailymotion"""
1287 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1289 def __init__(self, downloader=None):
1290 InfoExtractor.__init__(self, downloader)
1294 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1296 def report_download_webpage(self, video_id):
1297 """Report webpage download."""
1298 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1300 def report_extraction(self, video_id):
1301 """Report information extraction."""
1302 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1304 def _real_initialize(self):
1307 def _real_extract(self, url):
1308 # Extract id and simplified title from URL
1309 mobj = re.match(self._VALID_URL, url)
1311 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1314 # At this point we have a new video
1315 self._downloader.increment_downloads()
1316 video_id = mobj.group(1)
1318 simple_title = mobj.group(2).decode('utf-8')
1319 video_extension = 'flv'
1321 # Retrieve video webpage to extract further information
1322 request = urllib2.Request(url)
1324 self.report_download_webpage(video_id)
1325 webpage = urllib2.urlopen(request).read()
1326 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1327 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1330 # Extract URL, uploader and title from webpage
1331 self.report_extraction(video_id)
1332 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1334 self._downloader.trouble(u'ERROR: unable to extract media URL')
1336 mediaURL = urllib.unquote(mobj.group(1))
1338 # if needed add http://www.dailymotion.com/ if relative URL
1340 video_url = mediaURL
1342 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1343 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1345 self._downloader.trouble(u'ERROR: unable to extract title')
1347 video_title = mobj.group(1).decode('utf-8')
1348 video_title = sanitize_title(video_title)
1350 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1352 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1354 video_uploader = mobj.group(1)
1357 # Process video information
1358 self._downloader.process_info({
1359 'id': video_id.decode('utf-8'),
1360 'url': video_url.decode('utf-8'),
1361 'uploader': video_uploader.decode('utf-8'),
1362 'upload_date': u'NA',
1363 'title': video_title,
1364 'stitle': simple_title,
1365 'ext': video_extension.decode('utf-8'),
1369 except UnavailableVideoError:
1370 self._downloader.trouble(u'\nERROR: unable to download video')
1372 class GoogleIE(InfoExtractor):
1373 """Information extractor for video.google.com."""
1375 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1377 def __init__(self, downloader=None):
1378 InfoExtractor.__init__(self, downloader)
1382 return (re.match(GoogleIE._VALID_URL, url) is not None)
1384 def report_download_webpage(self, video_id):
1385 """Report webpage download."""
1386 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1388 def report_extraction(self, video_id):
1389 """Report information extraction."""
1390 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1392 def _real_initialize(self):
1395 def _real_extract(self, url):
1396 # Extract id from URL
1397 mobj = re.match(self._VALID_URL, url)
1399 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1402 # At this point we have a new video
1403 self._downloader.increment_downloads()
1404 video_id = mobj.group(1)
1406 video_extension = 'mp4'
1408 # Retrieve video webpage to extract further information
1409 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1411 self.report_download_webpage(video_id)
1412 webpage = urllib2.urlopen(request).read()
1413 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1414 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1417 # Extract URL, uploader, and title from webpage
1418 self.report_extraction(video_id)
1419 mobj = re.search(r"download_url:'([^']+)'", webpage)
1421 video_extension = 'flv'
1422 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1424 self._downloader.trouble(u'ERROR: unable to extract media URL')
1426 mediaURL = urllib.unquote(mobj.group(1))
1427 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1428 mediaURL = mediaURL.replace('\\x26', '\x26')
1430 video_url = mediaURL
1432 mobj = re.search(r'<title>(.*)</title>', webpage)
1434 self._downloader.trouble(u'ERROR: unable to extract title')
1436 video_title = mobj.group(1).decode('utf-8')
1437 video_title = sanitize_title(video_title)
1438 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1440 # Extract video description
1441 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1443 self._downloader.trouble(u'ERROR: unable to extract video description')
1445 video_description = mobj.group(1).decode('utf-8')
1446 if not video_description:
1447 video_description = 'No description available.'
1449 # Extract video thumbnail
1450 if self._downloader.params.get('forcethumbnail', False):
1451 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1453 webpage = urllib2.urlopen(request).read()
1454 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1455 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1457 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1459 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1461 video_thumbnail = mobj.group(1)
1462 else: # we need something to pass to process_info
1463 video_thumbnail = ''
1467 # Process video information
1468 self._downloader.process_info({
1469 'id': video_id.decode('utf-8'),
1470 'url': video_url.decode('utf-8'),
1472 'upload_date': u'NA',
1473 'title': video_title,
1474 'stitle': simple_title,
1475 'ext': video_extension.decode('utf-8'),
1479 except UnavailableVideoError:
1480 self._downloader.trouble(u'\nERROR: unable to download video')
1483 class PhotobucketIE(InfoExtractor):
1484 """Information extractor for photobucket.com."""
1486 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1488 def __init__(self, downloader=None):
1489 InfoExtractor.__init__(self, downloader)
1493 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1495 def report_download_webpage(self, video_id):
1496 """Report webpage download."""
1497 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1499 def report_extraction(self, video_id):
1500 """Report information extraction."""
1501 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1503 def _real_initialize(self):
1506 def _real_extract(self, url):
1507 # Extract id from URL
1508 mobj = re.match(self._VALID_URL, url)
1510 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1513 # At this point we have a new video
1514 self._downloader.increment_downloads()
1515 video_id = mobj.group(1)
1517 video_extension = 'flv'
1519 # Retrieve video webpage to extract further information
1520 request = urllib2.Request(url)
1522 self.report_download_webpage(video_id)
1523 webpage = urllib2.urlopen(request).read()
1524 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1525 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1528 # Extract URL, uploader, and title from webpage
1529 self.report_extraction(video_id)
1530 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1532 self._downloader.trouble(u'ERROR: unable to extract media URL')
1534 mediaURL = urllib.unquote(mobj.group(1))
1536 video_url = mediaURL
1538 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1540 self._downloader.trouble(u'ERROR: unable to extract title')
1542 video_title = mobj.group(1).decode('utf-8')
1543 video_title = sanitize_title(video_title)
1544 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1546 video_uploader = mobj.group(2).decode('utf-8')
1549 # Process video information
1550 self._downloader.process_info({
1551 'id': video_id.decode('utf-8'),
1552 'url': video_url.decode('utf-8'),
1553 'uploader': video_uploader,
1554 'upload_date': u'NA',
1555 'title': video_title,
1556 'stitle': simple_title,
1557 'ext': video_extension.decode('utf-8'),
1561 except UnavailableVideoError:
1562 self._downloader.trouble(u'\nERROR: unable to download video')
1565 class YahooIE(InfoExtractor):
1566 """Information extractor for video.yahoo.com."""
1568 # _VALID_URL matches all Yahoo! Video URLs
1569 # _VPAGE_URL matches only the extractable '/watch/' URLs
1570 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1571 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1573 def __init__(self, downloader=None):
1574 InfoExtractor.__init__(self, downloader)
1578 return (re.match(YahooIE._VALID_URL, url) is not None)
1580 def report_download_webpage(self, video_id):
1581 """Report webpage download."""
1582 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1584 def report_extraction(self, video_id):
1585 """Report information extraction."""
1586 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1588 def _real_initialize(self):
1591 def _real_extract(self, url, new_video=True):
1592 # Extract ID from URL
1593 mobj = re.match(self._VALID_URL, url)
1595 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1598 # At this point we have a new video
1599 self._downloader.increment_downloads()
1600 video_id = mobj.group(2)
1601 video_extension = 'flv'
1603 # Rewrite valid but non-extractable URLs as
1604 # extractable English language /watch/ URLs
1605 if re.match(self._VPAGE_URL, url) is None:
1606 request = urllib2.Request(url)
1608 webpage = urllib2.urlopen(request).read()
1609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1610 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1613 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1615 self._downloader.trouble(u'ERROR: Unable to extract id field')
1617 yahoo_id = mobj.group(1)
1619 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1621 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1623 yahoo_vid = mobj.group(1)
1625 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1626 return self._real_extract(url, new_video=False)
1628 # Retrieve video webpage to extract further information
1629 request = urllib2.Request(url)
1631 self.report_download_webpage(video_id)
1632 webpage = urllib2.urlopen(request).read()
1633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1634 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1637 # Extract uploader and title from webpage
1638 self.report_extraction(video_id)
1639 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1641 self._downloader.trouble(u'ERROR: unable to extract video title')
1643 video_title = mobj.group(1).decode('utf-8')
1644 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1646 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1648 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1650 video_uploader = mobj.group(1).decode('utf-8')
1652 # Extract video thumbnail
1653 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1655 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1657 video_thumbnail = mobj.group(1).decode('utf-8')
1659 # Extract video description
1660 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1662 self._downloader.trouble(u'ERROR: unable to extract video description')
1664 video_description = mobj.group(1).decode('utf-8')
1665 if not video_description: video_description = 'No description available.'
1667 # Extract video height and width
1668 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract video height')
1672 yv_video_height = mobj.group(1)
1674 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1676 self._downloader.trouble(u'ERROR: unable to extract video width')
1678 yv_video_width = mobj.group(1)
1680 # Retrieve video playlist to extract media URL
1681 # I'm not completely sure what all these options are, but we
1682 # seem to need most of them, otherwise the server sends a 401.
1683 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1684 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1685 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1686 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1687 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1689 self.report_download_webpage(video_id)
1690 webpage = urllib2.urlopen(request).read()
1691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1692 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1695 # Extract media URL from playlist XML
1696 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1698 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1700 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1701 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1704 # Process video information
1705 self._downloader.process_info({
1706 'id': video_id.decode('utf-8'),
1708 'uploader': video_uploader,
1709 'upload_date': u'NA',
1710 'title': video_title,
1711 'stitle': simple_title,
1712 'ext': video_extension.decode('utf-8'),
1713 'thumbnail': video_thumbnail.decode('utf-8'),
1714 'description': video_description,
1715 'thumbnail': video_thumbnail,
1716 'description': video_description,
1719 except UnavailableVideoError:
1720 self._downloader.trouble(u'\nERROR: unable to download video')
1723 class VimeoIE(InfoExtractor):
1724 """Information extractor for vimeo.com."""
1726 # _VALID_URL matches Vimeo URLs
1727 _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)'
1729 def __init__(self, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1734 return (re.match(VimeoIE._VALID_URL, url) is not None)
1736 def report_download_webpage(self, video_id):
1737 """Report webpage download."""
1738 self._downloader.to_screen(u'[video.vimeo] %s: Downloading webpage' % video_id)
1740 def report_extraction(self, video_id):
1741 """Report information extraction."""
1742 self._downloader.to_screen(u'[video.vimeo] %s: Extracting information' % video_id)
1744 def _real_initialize(self):
1747 def _real_extract(self, url, new_video=True):
1748 # Extract ID from URL
1749 mobj = re.match(self._VALID_URL, url)
1751 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1754 # At this point we have a new video
1755 self._downloader.increment_downloads()
1756 video_id = mobj.group(1)
1757 video_extension = 'flv' # FIXME
1759 # Retrieve video webpage to extract further information
1760 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1762 self.report_download_webpage(video_id)
1763 webpage = urllib2.urlopen(request).read()
1764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1765 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1768 # Extract uploader and title from webpage
1769 self.report_extraction(video_id)
1770 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1772 self._downloader.trouble(u'ERROR: unable to extract video title')
1774 video_title = mobj.group(1).decode('utf-8')
1775 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1777 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1779 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1781 video_uploader = mobj.group(1).decode('utf-8')
1783 # Extract video thumbnail
1784 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1786 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1788 video_thumbnail = mobj.group(1).decode('utf-8')
1790 # # Extract video description
1791 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1793 # self._downloader.trouble(u'ERROR: unable to extract video description')
1795 # video_description = mobj.group(1).decode('utf-8')
1796 # if not video_description: video_description = 'No description available.'
1797 video_description = 'Foo.'
1799 # Extract request signature
1800 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1802 self._downloader.trouble(u'ERROR: unable to extract request signature')
1804 sig = mobj.group(1).decode('utf-8')
1806 # Extract request signature expiration
1807 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1809 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1811 sig_exp = mobj.group(1).decode('utf-8')
1813 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
1816 # Process video information
1817 self._downloader.process_info({
1818 'id': video_id.decode('utf-8'),
1820 'uploader': video_uploader,
1821 'upload_date': u'NA',
1822 'title': video_title,
1823 'stitle': simple_title,
1824 'ext': video_extension.decode('utf-8'),
1825 'thumbnail': video_thumbnail.decode('utf-8'),
1826 'description': video_description,
1827 'thumbnail': video_thumbnail,
1828 'description': video_description,
1831 except UnavailableVideoError:
1832 self._downloader.trouble(u'ERROR: unable to download video')
1835 class GenericIE(InfoExtractor):
1836 """Generic last-resort information extractor."""
1838 def __init__(self, downloader=None):
1839 InfoExtractor.__init__(self, downloader)
1845 def report_download_webpage(self, video_id):
1846 """Report webpage download."""
1847 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1848 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1850 def report_extraction(self, video_id):
1851 """Report information extraction."""
1852 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1854 def _real_initialize(self):
1857 def _real_extract(self, url):
1858 # At this point we have a new video
1859 self._downloader.increment_downloads()
1861 video_id = url.split('/')[-1]
1862 request = urllib2.Request(url)
1864 self.report_download_webpage(video_id)
1865 webpage = urllib2.urlopen(request).read()
1866 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1867 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1869 except ValueError, err:
1870 # since this is the last-resort InfoExtractor, if
1871 # this error is thrown, it'll be thrown here
1872 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1875 self.report_extraction(video_id)
1876 # Start with something easy: JW Player in SWFObject
1877 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1879 # Broaden the search a little bit
1880 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1882 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1885 # It's possible that one of the regexes
1886 # matched, but returned an empty group:
1887 if mobj.group(1) is None:
1888 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1891 video_url = urllib.unquote(mobj.group(1))
1892 video_id = os.path.basename(video_url)
1894 # here's a fun little line of code for you:
1895 video_extension = os.path.splitext(video_id)[1][1:]
1896 video_id = os.path.splitext(video_id)[0]
1898 # it's tempting to parse this further, but you would
1899 # have to take into account all the variations like
1900 # Video Title - Site Name
1901 # Site Name | Video Title
1902 # Video Title - Tagline | Site Name
1903 # and so on and so forth; it's just not practical
1904 mobj = re.search(r'<title>(.*)</title>', webpage)
1906 self._downloader.trouble(u'ERROR: unable to extract title')
1908 video_title = mobj.group(1).decode('utf-8')
1909 video_title = sanitize_title(video_title)
1910 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1912 # video uploader is domain name
1913 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1915 self._downloader.trouble(u'ERROR: unable to extract title')
1917 video_uploader = mobj.group(1).decode('utf-8')
1920 # Process video information
1921 self._downloader.process_info({
1922 'id': video_id.decode('utf-8'),
1923 'url': video_url.decode('utf-8'),
1924 'uploader': video_uploader,
1925 'upload_date': u'NA',
1926 'title': video_title,
1927 'stitle': simple_title,
1928 'ext': video_extension.decode('utf-8'),
1932 except UnavailableVideoError, err:
1933 self._downloader.trouble(u'\nERROR: unable to download video')
1936 class YoutubeSearchIE(InfoExtractor):
1937 """Information Extractor for YouTube search queries."""
1938 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1939 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1940 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1941 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1943 _max_youtube_results = 1000
1945 def __init__(self, youtube_ie, downloader=None):
1946 InfoExtractor.__init__(self, downloader)
1947 self._youtube_ie = youtube_ie
1951 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1953 def report_download_page(self, query, pagenum):
1954 """Report attempt to download playlist page with given number."""
1955 query = query.decode(preferredencoding())
1956 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1958 def _real_initialize(self):
1959 self._youtube_ie.initialize()
1961 def _real_extract(self, query):
1962 mobj = re.match(self._VALID_QUERY, query)
1964 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1967 prefix, query = query.split(':')
1969 query = query.encode('utf-8')
1971 self._download_n_results(query, 1)
1973 elif prefix == 'all':
1974 self._download_n_results(query, self._max_youtube_results)
1980 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1982 elif n > self._max_youtube_results:
1983 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1984 n = self._max_youtube_results
1985 self._download_n_results(query, n)
1987 except ValueError: # parsing prefix as integer fails
1988 self._download_n_results(query, 1)
    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): elided — `video_ids = []`, the `pagenum` initializer and
        # the paging loop / try headers are missing from this excerpt.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # The indicator matched a whole href; slice the id out of the span
            # (text after the second '=' minus the trailing quote).
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link: last results page — extract everything collected.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Query scheme: "gvsearch<N>:terms" / "gvsearchall:terms" / "gvsearch:terms".
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    # %s slots: url-quoted query, result start index.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Captures the docid of each result link.
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    # Presence of a "Next" span means more result pages exist.
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'

    # Upper bound enforced on gvsearch<N> requests.
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this wrapped GoogleIE.
        self._google_ie = google_ie

        # NOTE(review): a `def suitable(url):` header appears to be missing
        # from this excerpt.
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        # Defer real initialization to the wrapped Google Video extractor.
        self._google_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        # NOTE(review): elided — presumably guarded by `if mobj is None:` + return.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # NOTE(review): elided — an empty prefix presumably downloads one result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # NOTE(review): elided — remaining branches parse the prefix as an
        # integer n inside a try block (see the except ValueError below).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp oversized requests to the hard cap, with a warning.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): elided — `video_ids = []`, the `pagenum` initializer and
        # the paging loop / try headers are missing from this excerpt.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No "Next" span: last results page — extract everything collected.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # Query scheme: "yvsearch<N>:terms" / "yvsearchall:terms" / "yvsearch:terms".
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    # %s slots: url-quoted query, result offset.
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures the "<id>/<id>" watch-path of each result link.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Presence of a "Next" label means more result pages exist.
    _MORE_PAGES_INDICATOR = r'\s*Next'

    # Upper bound enforced on yvsearch<N> requests.
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this wrapped YahooIE.
        self._yahoo_ie = yahoo_ie

        # NOTE(review): a `def suitable(url):` header appears to be missing
        # from this excerpt.
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        # Defer real initialization to the wrapped Yahoo! Video extractor.
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        # NOTE(review): elided — presumably guarded by `if mobj is None:` + return.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # NOTE(review): elided — an empty prefix presumably downloads one result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): elided — remaining branches parse the prefix as an
        # integer n inside a try block (see the except ValueError below).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp oversized requests to the hard cap, with a warning.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): elided — `video_ids = []`, the `pagenum` initializer and
        # the paging loop / try headers are missing from this excerpt.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No "Next" label: last results page — extract everything collected.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
2209 class YoutubePlaylistIE(InfoExtractor):
2210 """Information Extractor for YouTube playlists."""
2212 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2213 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2214 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2215 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2218 def __init__(self, youtube_ie, downloader=None):
2219 InfoExtractor.__init__(self, downloader)
2220 self._youtube_ie = youtube_ie
2224 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2226 def report_download_page(self, playlist_id, pagenum):
2227 """Report attempt to download playlist page with given number."""
2228 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2230 def _real_initialize(self):
2231 self._youtube_ie.initialize()
2233 def _real_extract(self, url):
2234 # Extract playlist id
2235 mobj = re.match(self._VALID_URL, url)
2237 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2241 if mobj.group(3) is not None:
2242 self._youtube_ie.extract(mobj.group(3))
2245 # Download playlist pages
2246 # prefix is 'p' as default for playlists but there are other types that need extra care
2247 playlist_prefix = mobj.group(1)
2248 if playlist_prefix == 'a':
2249 playlist_access = 'artist'
2251 playlist_prefix = 'p'
2252 playlist_access = 'view_play_list'
2253 playlist_id = mobj.group(2)
2258 self.report_download_page(playlist_id, pagenum)
2259 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2261 page = urllib2.urlopen(request).read()
2262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2266 # Extract video identifiers
2268 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2269 if mobj.group(1) not in ids_in_page:
2270 ids_in_page.append(mobj.group(1))
2271 video_ids.extend(ids_in_page)
2273 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2275 pagenum = pagenum + 1
2277 playliststart = self._downloader.params.get('playliststart', 1) - 1
2278 playlistend = self._downloader.params.get('playlistend', -1)
2279 video_ids = video_ids[playliststart:playlistend]
2281 for id in video_ids:
2282 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads-feed request at this many entries.
    _GDATA_PAGE_SIZE = 50
    # %s/%d slots: username, max-results, start-index.
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    # Captures each video id linked from the feed page.
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to this wrapped YoutubeIE.
        self._youtube_ie = youtube_ie

        # NOTE(review): a `def suitable(url):` header appears to be missing
        # from this excerpt.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        # Defer real initialization to the wrapped YouTube extractor.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract the username from the URL (or ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): elided — presumably guarded by `if mobj is None:` + return.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): elided — `video_ids = []`, `pagenum` initializer,
        # the paging loop header and the try statement are missing here.

        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): elided — `ids_in_page = []` initializer.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # -1 means "to the end": slice open-ended so the last id is kept.
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # NOTE(review): elided — `else:` header before the bounded slice.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment: the optional "../" hop is a locale path segment.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): a `def suitable(url):` header appears to be missing
        # from this excerpt.
        return (re.match(DepositFilesIE._VALID_URL, url) is not None)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # NOTE(review): body elided from this excerpt (likely `return`/pass).

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # NOTE(review): elided — try header around the download.
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # NOTE(review): elided — `else:` header before the generic error.
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): elided — presumably `if mobj is None:` guard + return.
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        # NOTE(review): elided — remaining dict entries, closing brace and the
        # try header matching the except below.
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used to pick the highest quality available.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
    # NOTE(review): elided — the format -> extension entries and closing brace
    # are missing from this excerpt.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): a `def suitable(url):` header appears to be missing
        # from this excerpt.
        return (re.match(FacebookIE._VALID_URL, url) is not None)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regex per scalar field scraped from the watch page.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        # NOTE(review): elided — dict close and `video_info = {}` initializer.
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per available format found in the page.
        # NOTE(review): elided — `video_urls = {}` initializer.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        # Without a downloader there is nowhere to take credentials from.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): elided — try header and the branch assigning
            # useremail/password from the netrc entry.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials available: skip login entirely.
        if useremail is None:

        # NOTE(review): elided — `login_form` construction and try header.
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): elided — presumably `if mobj is None:` guard + return.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: everything outside the allowed chars collapses to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        # NOTE(review): elided — `else:` header before this assignment.
            video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # NOTE(review): elided — try header around strftime.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description (only fetched when --get-description asked for it)
        video_description = 'No description available.'
        if (self._downloader.params.get('forcedescription', False) and
                'description' in video_info):
            video_description = video_info['description']

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(review): elided — `else:` header before this assignment.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): elided — `else:` header before this branch.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension defaults to mp4 when the format is unknown.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Find the video URL in fmt_url_map or conn paramters
            # NOTE(review): elided — try header matching the except below.
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information # by default, do nothing
2724 ### MAIN PROGRAM ###
2725 if __name__ == '__main__':
2727 # Modules needed only when running the main program
    # Function to update the program file with the latest version from the repository.
    def update_self(downloader, filename):
        # Note: downloader only used for options
        # Refuse to update a file we cannot write back.
        if not os.access(filename, os.W_OK):
            sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest stable version...')

        # NOTE(review): elided — try header around the two downloads below,
        # matching the except (IOError, OSError) clause.
        latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
        latest_version = urllib.urlopen(latest_url).read().strip()
        prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
        newcontent = urllib.urlopen(prog_url).read()
        except (IOError, OSError), err:
            sys.exit('ERROR: unable to download latest version')

        # NOTE(review): elided — try header and `stream.close()` around the
        # overwrite; confirm the stream is closed in the full source.
        stream = open(filename, 'w')
        stream.write(newcontent)
        except (IOError, OSError), err:
            sys.exit('ERROR: unable to overwrite current version')
        downloader.to_screen('Updated to version %s' % latest_version)
2753 # Parse command line
2754 parser = optparse.OptionParser(
2755 usage='Usage: %prog [options] url...',
2756 version='2011.01.30',
2757 conflict_handler='resolve',
2760 parser.add_option('-h', '--help',
2761 action='help', help='print this help text and exit')
2762 parser.add_option('-v', '--version',
2763 action='version', help='print program version and exit')
2764 parser.add_option('-U', '--update',
2765 action='store_true', dest='update_self', help='update this program to latest stable version')
2766 parser.add_option('-i', '--ignore-errors',
2767 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2768 parser.add_option('-r', '--rate-limit',
2769 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2770 parser.add_option('-R', '--retries',
2771 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2772 parser.add_option('--playlist-start',
2773 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2774 parser.add_option('--playlist-end',
2775 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2776 parser.add_option('--dump-user-agent',
2777 action='store_true', dest='dump_user_agent',
2778 help='display the current browser identification', default=False)
2780 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2781 authentication.add_option('-u', '--username',
2782 dest='username', metavar='USERNAME', help='account username')
2783 authentication.add_option('-p', '--password',
2784 dest='password', metavar='PASSWORD', help='account password')
2785 authentication.add_option('-n', '--netrc',
2786 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2787 parser.add_option_group(authentication)
2789 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2790 video_format.add_option('-f', '--format',
2791 action='store', dest='format', metavar='FORMAT', help='video format code')
2792 video_format.add_option('--all-formats',
2793 action='store_const', dest='format', help='download all available video formats', const='-1')
2794 video_format.add_option('--max-quality',
2795 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2796 parser.add_option_group(video_format)
2798 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2799 verbosity.add_option('-q', '--quiet',
2800 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2801 verbosity.add_option('-s', '--simulate',
2802 action='store_true', dest='simulate', help='do not download video', default=False)
2803 verbosity.add_option('-g', '--get-url',
2804 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2805 verbosity.add_option('-e', '--get-title',
2806 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2807 verbosity.add_option('--get-thumbnail',
2808 action='store_true', dest='getthumbnail',
2809 help='simulate, quiet but print thumbnail URL', default=False)
2810 verbosity.add_option('--get-description',
2811 action='store_true', dest='getdescription',
2812 help='simulate, quiet but print video description', default=False)
2813 verbosity.add_option('--get-filename',
2814 action='store_true', dest='getfilename',
2815 help='simulate, quiet but print output filename', default=False)
2816 verbosity.add_option('--no-progress',
2817 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2818 verbosity.add_option('--console-title',
2819 action='store_true', dest='consoletitle',
2820 help='display progress in console titlebar', default=False)
2821 parser.add_option_group(verbosity)
2823 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2824 filesystem.add_option('-t', '--title',
2825 action='store_true', dest='usetitle', help='use title in file name', default=False)
2826 filesystem.add_option('-l', '--literal',
2827 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2828 filesystem.add_option('-A', '--auto-number',
2829 action='store_true', dest='autonumber',
2830 help='number downloaded files starting from 00000', default=False)
2831 filesystem.add_option('-o', '--output',
2832 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2833 filesystem.add_option('-a', '--batch-file',
2834 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2835 filesystem.add_option('-w', '--no-overwrites',
2836 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2837 filesystem.add_option('-c', '--continue',
2838 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2839 filesystem.add_option('--cookies',
2840 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2841 filesystem.add_option('--no-part',
2842 action='store_true', dest='nopart', help='do not use .part files', default=False)
2843 filesystem.add_option('--no-mtime',
2844 action='store_false', dest='updatetime',
2845 help='do not use the Last-modified header to set the file modification time', default=True)
2846 parser.add_option_group(filesystem)
2848 (opts, args) = parser.parse_args()
2850 # Open appropriate CookieJar
2851 if opts.cookiefile is None:
2852 jar = cookielib.CookieJar()
2855 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2856 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2858 except (IOError, OSError), err:
2859 sys.exit(u'ERROR: unable to open cookie file')
2862 if opts.dump_user_agent:
2863 print std_headers['User-Agent']
2866 # General configuration
2867 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2868 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2869 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2871 # Batch file verification
2873 if opts.batchfile is not None:
2875 if opts.batchfile == '-':
2878 batchfd = open(opts.batchfile, 'r')
2879 batchurls = batchfd.readlines()
2880 batchurls = [x.strip() for x in batchurls]
2881 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2883 sys.exit(u'ERROR: batch file could not be read')
2884 all_urls = batchurls + args
2886 # Conflicting, missing and erroneous options
2887 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2888 parser.error(u'using .netrc conflicts with giving username/password')
2889 if opts.password is not None and opts.username is None:
2890 parser.error(u'account username missing')
2891 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2892 parser.error(u'using output template conflicts with using title, literal title or auto number')
2893 if opts.usetitle and opts.useliteral:
2894 parser.error(u'using title conflicts with using literal title')
2895 if opts.username is not None and opts.password is None:
2896 opts.password = getpass.getpass(u'Type account password and press return:')
2897 if opts.ratelimit is not None:
2898 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2899 if numeric_limit is None:
2900 parser.error(u'invalid rate limit specified')
2901 opts.ratelimit = numeric_limit
2902 if opts.retries is not None:
2904 opts.retries = long(opts.retries)
2905 except (TypeError, ValueError), err:
2906 parser.error(u'invalid retry count specified')
2908 opts.playliststart = long(opts.playliststart)
2909 if opts.playliststart <= 0:
2911 except (TypeError, ValueError), err:
2912 parser.error(u'invalid playlist start number specified')
2914 opts.playlistend = long(opts.playlistend)
2915 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2917 except (TypeError, ValueError), err:
2918 parser.error(u'invalid playlist end number specified')
# Information extractors
# One instance per supported site.  Search and playlist extractors are
# handed the corresponding single-video extractor so they can delegate
# the actual per-video extraction to it.
vimeo_ie = VimeoIE()
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
generic_ie = GenericIE()
# File downloader
# Any of the --get-* options implies both quiet and simulate mode, since
# the requested piece of information is printed on stdout instead of
# downloading anything.
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'format': opts.format,
    'format_limit': opts.format_limit,
    # Output template selection: an explicit -o template wins; otherwise
    # pick a default that encodes format/title/autonumber choices.  The
    # chained 'or's fall through to the first applicable template.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    # When downloading to stdout ('-o -'), status output must go to
    # stderr so it does not corrupt the video data.
    'logtostderr': opts.outtmpl == '-',
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
    })
# Register the extractors with the downloader.  Order matters: the
# downloader tries them in registration order, so the more specific
# extractors (playlists, users, searches) must come before the plain
# single-video ones that could also match their URLs.
fd.add_info_extractor(vimeo_ie)
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
2994 if opts.update_self:
2995 update_self(fd, sys.argv[0])
2998 if len(all_urls) < 1:
2999 if not opts.update_self:
3000 parser.error(u'you must provide at least one URL')
3003 retcode = fd.download(all_urls)
3005 # Dump cookie jar if requested
3006 if opts.cookiefile is not None:
3009 except (IOError, OSError), err:
3010 sys.exit(u'ERROR: unable to save cookie jar')
3014 except DownloadError:
3016 except SameFileError:
3017 sys.exit(u'ERROR: fixed output name but more than one file to download')
3018 except KeyboardInterrupt:
3019 sys.exit(u'\nERROR: Interrupted by user')