2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
# NOTE(review): this is the "trivialjson" fallback JSON decoder used when the
# stdlib json module is unavailable (Python < 2.6).  The enclosing
# `def loads(s):` and many interior lines are missing from this copy, so
# several statements below are orphaned; only comments were added here.
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Report a parse failure, pinpointing the offending position and remaining input.
def raiseError(msg, i):
    raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Skip JSON whitespace starting at i; errors out on premature end of input.
def skipSpace(i, expectMore=True):
    while i < len(s) and s[i] in ' \t\r\n':
    raiseError('Premature end', i)
# Decode one backslash escape found inside a JSON string literal.
def decodeEscape(match):
    # Plain \uXXXX escape (BMP character).
    return unichr(int(esc[1:5], 16))
    # Surrogate pair: a high surrogate escape followed by a second \uXXXX escape.
    if len(esc) == 5+6 and esc[5:7] == '\\u':
        hi = int(esc[1:5], 16)
        low = int(esc[7:11], 16)
        # Combine the high/low surrogates into one code point above U+FFFF.
        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
    raise ValueError('Unknown escape ' + str(esc))
# String parsing: count trailing backslashes to find the true closing quote.
while s[e-bslashes-1] == '\\':
if bslashes % 2 == 1:
# Matches a surrogate pair, a single \uXXXX escape, or any escaped character.
rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
stri = rexp.sub(decodeEscape, s[i:e])
# Object parsing.
if s[i] == '}': # Empty dictionary
raiseError('Expected a string object key', i)
i,key = parseString(i)
if i >= len(s) or s[i] != ':':
    raiseError('Expected a colon', i)
raiseError('Expected comma or closing curly brace', i)
# Array parsing.
if s[i] == ']': # Empty array
i = skipSpace(i) # Raise exception if premature end
raiseError('Expected a comma or closing bracket', i)
# The literals true/false/null.
def parseDiscrete(i):
    for k,v in {'true': True, 'false': False, 'null': None}.items():
        if s.startswith(k, i):
    raiseError('Not a boolean (or null)', i)
# Number parsing -- NOTE(review): the `def parseNumber(i):` header itself
# appears to be missing from this copy.
mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
raiseError('Not a number', i)
# A dot or an exponent marks a float; everything else is an int.
if '.' in nums or 'e' in nums or 'E' in nums:
    return (i+len(nums), float(nums))
return (i+len(nums), int(nums))
# Dispatch on the first character of the next value; numbers are the default.
CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
i,res = CHARMAP.get(s[i], parseNumber)(i)
i = skipSpace(i, False)
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale reports an unusable codec.
    """
    # The original wrapped this in a one-shot generator and called .next()
    # on it, which is both needless indirection and Python-2-only syntax;
    # a plain try/except returns the same value.
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: a broken or unknown locale raises here.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, decimal (#160) or hex (#x3f).
    # NOTE(review): the guard on this match (`if mobj is not None:`) appears
    # to be missing from this copy -- confirm against upstream.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # NOTE(review): the base selection (16 for hex, 10 otherwise) that
        # defines `base` appears to be missing from this copy.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then neutralise path separators.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function would.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the enclosing try: for the IOError/OSError handler below,
    # and presumably a `filename == u'-'` (stdout) special-case guard, are
    # missing from this copy.
    if sys.platform == 'win32':
        # Switch stdout to binary mode so video data is not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout, filename)
    stream = open(_encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as a number, or None when *timestr*
    cannot be parsed as an RFC 2822 date.
    """
    # Restored: without the None initialisation and the final return, the
    # function either raised UnboundLocalError or silently returned None
    # for every input.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
def _unescapeHTML(s):
    """Return *s* with its HTML entities converted back to characters.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    parser = HTMLParser.HTMLParser()
    return parser.unescape(s)
def _encodeFilename(s):
    """Encode a unicode filename for use with the local filesystem APIs.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # NOTE(review): the `return s` for this Windows branch appears to be
        # missing from this copy.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that two or more of
    the requested downloads would end up in the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when the data they downloaded is
    smaller than what the server announced first, which usually means
    the connection was interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): the `deflate` staticmethod header and its try/except
    # scaffolding appear to be missing from this copy: raw deflate is tried
    # first, zlib-wrapped deflate is presumably the fallback.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    # addinfourl gained a code argument/getcode() in later Pythons;
    # emulate that on older versions.
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # NOTE(review): `ret.code = code` / `return ret` appear missing here.

    def http_request(self, req):
        # Add each standard header; the guard that skips headers the caller
        # already set appears to be missing from this copy.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Honour the magic no-compression marker header and strip it out
        # before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # NOTE(review): `return req` appears to be missing.

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate bodies while keeping the
        # original response metadata.  NOTE(review): `old_resp = resp`
        # appears to be missing from this copy.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # NOTE(review): `return resp` appears to be missing.
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    matchtitle: Download only matching titles.
    rejecttitle: Reject downloads for matching titles.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson: Write the video description to a .info.json file
    writesubtitles: Write the video subtitles to a .srt file
    subtitleslang: Language of the subtitles to download
    """

    _download_retcode = None   # process exit status produced by download()
    _num_downloads = None      # ordinal assigned to each finished download

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): the initialisation of self._ies, self._pps and
        # self.params appears to be missing from this copy.
        self._download_retcode = 0
        self._num_downloads = 0
        # logtostderr picks which stream to_screen() writes to.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): the helpers below are presumably @staticmethods; the decorator
# lines and several guard/return statements are missing from this copy.

def format_bytes(bytes):
    # Human-readable size string, e.g. '10.50M'; a str input passes through.
    if type(bytes) is str:
    # NOTE(review): `return bytes` and a zero-byte special case appear missing.
    exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)

def calc_percent(byte_counter, data_len):
    # NOTE(review): a None guard on data_len appears to be missing here.
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

def calc_eta(start, now, total, current):
    # NOTE(review): the `dif = now - start` assignment appears to be missing.
    if current == 0 or dif < 0.001: # One millisecond
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    # NOTE(review): the '--:--' fallback for huge ETAs appears to be missing.
    return '%02d:%02d' % (eta_mins, eta_secs)

def calc_speed(start, now, bytes):
    # NOTE(review): the `dif = now - start` assignment appears to be missing.
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

def best_block_size(elapsed_time, bytes):
    # Adaptive read size: halve/double based on how long the last read took.
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
    rate = bytes / elapsed_time
    # NOTE(review): the clamping of `rate` between new_min and new_max, and
    # the return statements, appear to be missing from this copy.

def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer."""
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    # NOTE(review): a None guard on matchobj appears to be missing here.
    number = float(matchobj.group(1))
    # Empty suffix maps to index 0, i.e. a multiplier of 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # NOTE(review): the append to self._ies appears to be missing here.
    ie.set_downloader(self)

def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # NOTE(review): the append to self._pps appears to be missing here.
    pp.set_downloader(self)

def to_screen(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode."""
    assert type(message) == type(u'')
    if not self.params.get('quiet', False):
        terminator = [u'\n', u''][skip_eol]
        output = message + terminator
        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
            output = output.encode(preferredencoding(), 'ignore')
        self._screen_file.write(output)
        self._screen_file.flush()

def to_stderr(self, message):
    """Print message to stderr."""
    print >>sys.stderr, message.encode(preferredencoding())

def to_cons_title(self, message):
    """Set console/terminal window title to message."""
    if not self.params.get('consoletitle', False):
        # NOTE(review): an early `return` presumably belongs in this guard
        # (missing from this copy); the code below should only run when
        # console titles are enabled.
    if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
    elif 'TERM' in os.environ:
        # xterm escape sequence to set the window title.
        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Reached only when errors are ignored: remember the failure for the
    # process exit code.
    self._download_retcode = 1

def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        # NOTE(review): an early `return` presumably belongs here (missing).
    # NOTE(review): `now = time.time()` appears to be missing from this copy.
    elapsed = now - start_time
    # NOTE(review): a non-positive-elapsed guard appears to be missing here.
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep exactly long enough to bring the average speed back down.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

def temp_name(self, filename):
    """Returns a temporary filename for the given filename."""
    # No .part file for stdout, when disabled, or when the target exists
    # but is not a regular file (e.g. a device or named pipe).
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
        # NOTE(review): `return filename` presumably belongs here (missing).
    return filename + u'.part'

def undo_temp_name(self, filename):
    # Strip the .part suffix added by temp_name().
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]
    # NOTE(review): the fallthrough `return filename` appears to be missing.

def try_rename(self, old_filename, new_filename):
    # Move the finished .part file into place; same-name rename is a no-op.
    if old_filename == new_filename:
        # NOTE(review): an early `return` and the enclosing try: for the
        # handler below appear to be missing from this copy.
    os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
    except (IOError, OSError), err:
        self.trouble(u'ERROR: unable to rename file')

def try_utime(self, filename, last_modified_hdr):
    """Try to set the last-modified time of the given file."""
    if last_modified_hdr is None:
        # NOTE(review): an early `return` presumably belongs here (missing).
    if not os.path.isfile(_encodeFilename(filename)):
        # NOTE(review): an early `return` presumably belongs here (missing).
    timestr = last_modified_hdr
    # NOTE(review): lines between here and the conversion (presumably a
    # parse-failure guard) appear to be missing from this copy.
    filetime = timeconvert(timestr)
    # Keep atime current; set mtime to the server-provided time.
    os.utime(filename, (time.time(), filetime))
    # NOTE(review): `return filetime` appears to be missing.
683 def report_writedescription(self, descfn):
684 """ Report that the description file is being written """
685 self.to_screen(u'[info] Writing video description to: ' + descfn)
687 def report_writesubtitles(self, srtfn):
688 """ Report that the subtitles file is being written """
689 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
695 def report_destination(self, filename):
696 """Report destination filename."""
697 self.to_screen(u'[download] Destination: ' + filename)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
708 def report_resuming_byte(self, resume_len):
709 """Report attempt to resume at given byte."""
710 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
712 def report_retry(self, count, retries):
713 """Report retry in case of HTTP error 5xx"""
714 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
716 def report_file_already_downloaded(self, file_name):
717 """Report file has already been fully downloaded."""
719 self.to_screen(u'[download] %s has already been downloaded' % file_name)
720 except (UnicodeEncodeError), err:
721 self.to_screen(u'[download] The file has already been downloaded')
723 def report_unable_to_resume(self):
724 """Report it was impossible to resume download."""
725 self.to_screen(u'[download] Unable to resume')
def report_finish(self):
    """Report download finished."""
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')
    # NOTE(review): the else-branch (presumably printing a bare newline to
    # terminate the progress bar line) appears to be missing from this copy.
734 def increment_downloads(self):
735 """Increment the ordinal that assigns a number to each file."""
736 self._num_downloads += 1
def prepare_filename(self, info_dict):
    """Generate the output filename."""
    # NOTE(review): the enclosing try: for the handler below and the final
    # `return filename` appear to be missing from this copy.
    template_dict = dict(info_dict)
    # Extra template fields available to outtmpl.
    template_dict['epoch'] = unicode(long(time.time()))
    template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
    filename = self.params['outtmpl'] % template_dict
    except (ValueError, KeyError), err:
        self.trouble(u'ERROR: invalid system charset or erroneous output template')

def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded """
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    # NOTE(review): the final `return None` appears to be missing.

def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor."""
    # NOTE(review): numerous try:/return lines are missing from this copy;
    # the orphaned except clauses below mark where they belonged.

    # Title filtering.
    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        # (early return appears to be missing here)

    # Enforce --max-downloads.
    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printing of selected fields (used by wrapper tools).
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        # (early return appears to be missing here)

    # Ensure the target directory exists.
    dn = os.path.dirname(_encodeFilename(filename))
    if dn != '' and not os.path.exists(dn): # dn is already encoded
        # (the os.makedirs(dn) call appears to be missing here)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))

    # Optional sidecar: .description file.
    if self.params.get('writedescription', False):
        descfn = filename + u'.description'
        self.report_writedescription(descfn)
        descfile = open(_encodeFilename(descfn), 'wb')
        descfile.write(info_dict['description'].encode('utf-8'))
    except (OSError, IOError):
        self.trouble(u'ERROR: Cannot write description file ' + descfn)

    # Optional sidecar: .srt subtitles.
    if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        srtfn = filename.rsplit('.', 1)[0] + u'.srt'
        self.report_writesubtitles(srtfn)
        srtfile = open(_encodeFilename(srtfn), 'wb')
        srtfile.write(info_dict['subtitles'].encode('utf-8'))
    except (OSError, IOError):
        # NOTE(review): this message interpolates `descfn`, which belongs to
        # the description branch above -- presumably `srtfn` was intended,
        # and descfn may even be unbound here.  TODO confirm and fix upstream.
        self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)

    # Optional sidecar: .info.json metadata.
    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
    except (NameError,AttributeError):
        self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
        infof = open(_encodeFilename(infofn), 'wb')
        # 'urlhandle' is an open connection object and cannot be serialised.
        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
        json.dump(json_info_dict, infof)
    except (OSError, IOError):
        self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

    # The actual download, unless suppressed.
    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
            # (already-downloaded report and early return appear missing here)
            success = self._do_download(filename, info_dict)
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # Run the postprocessing chain on success.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
def download(self, url_list):
    """Download a given list of URLs."""
    # A fixed (non-templated) output name can only hold a single download.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    # NOTE(review): the `for url in url_list:` / `for ie in self._ies:` loop
    # headers and several statements are missing from this copy.
    suitable_found = False
    # Go to next InfoExtractor if not suitable
    if not ie.suitable(url):
        # (a `continue` appears to be missing here)
    # Suitable InfoExtractor found
    suitable_found = True
    # Extract information from URL and process it
    # (the ie.extract(url) call appears to be missing here)
    # Suitable InfoExtractor had been found; go to next URL
    # (a `break` appears to be missing here)
    if not suitable_found:
        self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode

def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # NOTE(review): the copy of ie_info into `info` and the loop over
    # self._pps appear to be missing from this copy.
    info['filepath'] = filename

def _download_with_rtmpdump(self, filename, url, player_url):
    # Download an rtmp:// URL by shelling out to the external rtmpdump tool.
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    # NOTE(review): the enclosing try: for the handler below appears missing.
    subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        # (a `return False` appears to be missing here)

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrumpted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
    if self.params.get('verbose', False):
        # NOTE(review): a `pipes` import fallback presumably precedes this
        # (missing from this copy).
        shell_quote = lambda args: ' '.join(map(pipes.quote, args))
        self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
    retval = subprocess.call(args)
    # Resume loop: keep re-invoking rtmpdump while it reports a resumable stop.
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(_encodeFilename(tmpfilename))
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(_encodeFilename(tmpfilename))
        if prevsize == cursize and retval == 1:
            # (a `break` appears to be missing here)
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            # (clearing retval and a `break` appear to be missing here)
    # Success path -- NOTE(review): the `if retval == 0:` guard appears missing.
    self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
    self.try_rename(tmpfilename, filename)
    # (a `return True` / else-branch appears to be missing here)
    self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    # (a `return False` appears to be missing here)
def _do_download(self, filename, info_dict):
    # Plain-HTTP download path (rtmp URLs are delegated to rtmpdump).
    # NOTE(review): many scaffolding lines (try:, returns, loop headers,
    # counter initialisation) are missing from this copy; the orphaned
    # except clauses below mark where they belonged.
    url = info_dict['url']
    player_url = info_dict.get('player_url', None)

    # Check file already present
    if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
        self.report_file_already_downloaded(filename)
        # (a `return True` appears to be missing here)

    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url, player_url)

    tmpfilename = self.temp_name(filename)

    # Do not include the Accept-Encoding header
    headers = {'Youtubedl-no-compression': 'True'}
    basic_request = urllib2.Request(url, None, headers)
    request = urllib2.Request(url, None, headers)

    # Establish possible resume length
    if os.path.isfile(_encodeFilename(tmpfilename)):
        resume_len = os.path.getsize(_encodeFilename(tmpfilename))
    # (an else-branch setting resume_len to 0 appears to be missing here)

    # Request a partial body when resuming.
    if self.params.get('continuedl', False):
        self.report_resuming_byte(resume_len)
        request.add_header('Range','bytes=%d-' % resume_len)

    retries = self.params.get('retries', 0)
    # NOTE(review): the `count = 0` initialisation appears to be missing.
    while count <= retries:
        # Establish connection
        if count == 0 and 'urlhandle' in info_dict:
            data = info_dict['urlhandle']
        data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                # (a `raise` appears to be missing here)
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                # Open the connection again without the range header
                data = urllib2.urlopen(basic_request)
                content_length = data.info()['Content-Length']
                except (urllib2.HTTPError, ), err:
                    if err.code < 500 or err.code >= 600:
                        # (a `raise` appears to be missing here)
                # Examine the reported length
                if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                    # The file had already been fully downloaded.
                    # Explanation to the above condition: in issue #175 it was revealed that
                    # YouTube sometimes adds or removes a few bytes from the end of the file,
                    # changing the file size slightly and causing problems for some users. So
                    # I decided to implement a suggested change and consider the file
                    # completely downloaded if the file size differs less than 100 bytes from
                    # the one in the hard drive.
                    self.report_file_already_downloaded(filename)
                    self.try_rename(tmpfilename, filename)
                    # (a `return True` appears to be missing here)
                # The length does not match, we start the download over
                self.report_unable_to_resume()
                # (resetting open_mode/resume_len appears to be missing here)
        # Retry bookkeeping.
        if count <= retries:
            self.report_retry(count, retries)
    # Retries exhausted -- NOTE(review): the `if count > retries:` guard
    # appears to be missing from this copy.
    self.trouble(u'ERROR: giving up after %s retries' % retries)
    # (a `return False` appears to be missing here)

    data_len = data.info().get('Content-length', None)
    if data_len is not None:
        # A 206 partial response reports only the remaining length.
        data_len = long(data_len) + resume_len
    data_len_str = self.format_bytes(data_len)
    byte_counter = 0 + resume_len
    # NOTE(review): block_size/start initialisation and the `while True:`
    # read-loop header appear to be missing from this copy.

    # Download and write
    before = time.time()
    data_block = data.read(block_size)
    if len(data_block) == 0:
        # (a `break` out of the read loop appears to be missing here)
    byte_counter += len(data_block)

    # Open file just in time
    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
    assert stream is not None
    filename = self.undo_temp_name(tmpfilename)
    self.report_destination(filename)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
    stream.write(data_block)
    except (IOError, OSError), err:
        self.trouble(u'\nERROR: unable to write data: %s' % str(err))
    # Adapt the read size to the observed throughput.
    block_size = self.best_block_size(after - before, len(data_block))

    # Progress reporting.
    speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
    if data_len is None:
        self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
    # (an else-branch appears to be missing here)
    percent_str = self.calc_percent(byte_counter, data_len)
    eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
    self.report_progress(percent_str, data_len_str, speed_str, eta_str)

    # Apply rate limit
    self.slow_down(start, byte_counter - resume_len)

    # Post-loop checks -- NOTE(review): the `if stream is None:` guard
    # appears to be missing from this copy.
    self.trouble(u'\nERROR: Did not get any data blocks')
    # (a `return False` appears to be missing here)
    self.report_finish()
    if data_len is not None and byte_counter != data_len:
        raise ContentTooShortError(byte_counter, long(data_len))
    self.try_rename(tmpfilename, filename)

    # Update file modification time
    if self.params.get('updatetime', True):
        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

    # (a final `return True` appears to be missing here)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a `self._ready = False` initialisation presumably
        # precedes this line (missing from this copy).
        self.set_downloader(downloader)
1144 def suitable(self, url):
1145 """Receives a URL and returns True if suitable for this IE."""
1146 return re.match(self._VALID_URL, url) is not None
1148 def initialize(self):
1149 """Initializes an instance (authentication, etc)."""
1151 self._real_initialize()
1154 def extract(self, url):
1155 """Extracts URL information and returns it in list of dicts."""
1157 return self._real_extract(url)
1159 def set_downloader(self, downloader):
1160 """Sets the downloader for this IE."""
1161 self._downloader = downloader
1163 def _real_initialize(self):
1164 """Real initialization process. Redefine in subclasses."""
1167 def _real_extract(self, url):
1168 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com.

	NOTE(review): this listing is partially elided -- guard clauses,
	``try:`` lines, ``return`` statements and most dict-literal entries
	are missing from view; code is reproduced as shown.
	"""

	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forcing English/US avoids locale-dependent scraping breakage.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> container extension ('flv' is the fallback used below);
	# NOTE(review): remaining entries of both dicts are not visible here.
	_video_extensions = {
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	_video_dimensions = {
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text.

		NOTE(review): the initialization of the ``srt`` accumulator and
		the final ``return`` are not visible in this listing.
		"""
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'	# default caption duration: 4 seconds
			start = float(start)
			end = start + float(dur)
			# hh:mm:ss,mmm -- the SubRip timestamp layout
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional (entities can be double-escaped)
			srt += str(n) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'

	def _print_formats(self, formats):
		"""Print each available itag with its extension and dimensions."""
		print 'Available formats:'
		# NOTE(review): the enclosing 'for x in formats:' line is not visible here.
		print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set language and, when credentials are available, log in and confirm age.

		Credentials come from --username/--password or, with --netrc,
		from the 'youtube' machine entry in ~/.netrc.
		NOTE(review): several lines (returns, ``try:`` lines, dict
		literals) are elided in this listing.
		"""
		if self._downloader is None:

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
		except (IOError, netrc.NetrcParseError), err:
			self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Set language (best effort: failure only warns)
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed
		if username is None:

			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# The login form re-appearing in the response means auth failed
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Extract metadata for a YouTube URL and enqueue the download(s).

		NOTE(review): guard clauses, ``try:`` lines and ``return``
		statements are elided in this listing.
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		# Download the watch page (has_verified=1 skips the age interstitial)
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

		# Get video info: try several 'el' variants until one yields a token
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the page, tried against several formats
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		# description: meta tag first, then the full #eow-description text via lxml
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# closed captions (only when --write-srt was requested)
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				# Language preference: explicit option, then English, then first listed
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = srt_lang_list[0]
				if not srt_lang in srt_lang_list:
					self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
					request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
						srt_xml = urllib2.urlopen(request).read()
					except (urllib2.URLError, httplib.HTTPException, socket.error), err:
						self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
					video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
				self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
						video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description,
					'player_url': player_url,
					'subtitles': video_subtitles
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	NOTE(review): this listing is partially elided -- guard clauses,
	``try:`` lines and ``return`` statements are missing from view.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer must be fetched/disabled before extraction
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""Keep a YoutubeIE around: Metacafe mirrors some YouTube videos (yt-* ids)."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and post the age confirmation."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Extract and enqueue a Metacafe video; yt-* ids are delegated to YouTube."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
				video_url = mediaURL
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			# Fallback: media data lives in the flashvars parameter
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion.

	NOTE(review): this listing is partially elided -- guard clauses,
	``try:`` lines and ``return`` statements are missing from view.
	"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract and enqueue a Dailymotion video (SD stream from the 'sequence' variable)."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-restricted pages are served
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	NOTE(review): this listing is partially elided -- guard clauses,
	``try:`` lines and ``return`` statements are missing from view.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract and enqueue a Google Video entry (mp4 preferred, flv fallback)."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			# No mp4 download link: fall back to the escaped flv URL
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = urllib.unquote(mobj.group(1))
			# Undo the \xNN escaping Google embeds in the page source
			mediaURL = mediaURL.replace('\\x3d', '\x3d')
			mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires an extra search-page request)
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Only URLs whose 'current' query parameter names a .flv are recognized.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1901 def _real_extract(self, url):
1902 # Extract id from URL
1903 mobj = re.match(self._VALID_URL, url)
1905 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1908 # At this point we have a new video
1909 self._downloader.increment_downloads()
1910 video_id = mobj.group(1)
1912 video_extension = 'flv'
1914 # Retrieve video webpage to extract further information
1915 request = urllib2.Request(url)
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1923 # Extract URL, uploader, and title from webpage
1924 self.report_extraction(video_id)
1925 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1927 self._downloader.trouble(u'ERROR: unable to extract media URL')
1929 mediaURL = urllib.unquote(mobj.group(1))
1931 video_url = mediaURL
1933 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1935 self._downloader.trouble(u'ERROR: unable to extract title')
1937 video_title = mobj.group(1).decode('utf-8')
1938 video_title = sanitize_title(video_title)
1939 simple_title = _simplify_title(vide_title)
1941 video_uploader = mobj.group(2).decode('utf-8')
1944 # Process video information
1945 self._downloader.process_info({
1946 'id': video_id.decode('utf-8'),
1947 'url': video_url.decode('utf-8'),
1948 'uploader': video_uploader,
1949 'upload_date': u'NA',
1950 'title': video_title,
1951 'stitle': simple_title,
1952 'ext': video_extension.decode('utf-8'),
1956 except UnavailableVideoError:
1957 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1980 def _real_extract(self, url, new_video=True):
1981 # Extract ID from URL
1982 mobj = re.match(self._VALID_URL, url)
1984 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1987 # At this point we have a new video
1988 self._downloader.increment_downloads()
1989 video_id = mobj.group(2)
1990 video_extension = 'flv'
1992 # Rewrite valid but non-extractable URLs as
1993 # extractable English language /watch/ URLs
1994 if re.match(self._VPAGE_URL, url) is None:
1995 request = urllib2.Request(url)
1997 webpage = urllib2.urlopen(request).read()
1998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2004 self._downloader.trouble(u'ERROR: Unable to extract id field')
2006 yahoo_id = mobj.group(1)
2008 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2010 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2012 yahoo_vid = mobj.group(1)
2014 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2015 return self._real_extract(url, new_video=False)
2017 # Retrieve video webpage to extract further information
2018 request = urllib2.Request(url)
2020 self.report_download_webpage(video_id)
2021 webpage = urllib2.urlopen(request).read()
2022 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2023 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2026 # Extract uploader and title from webpage
2027 self.report_extraction(video_id)
2028 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2030 self._downloader.trouble(u'ERROR: unable to extract video title')
2032 video_title = mobj.group(1).decode('utf-8')
2033 simple_title = _simplify_title(video_title)
2035 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2037 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2039 video_uploader = mobj.group(1).decode('utf-8')
2041 # Extract video thumbnail
2042 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2044 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2046 video_thumbnail = mobj.group(1).decode('utf-8')
2048 # Extract video description
2049 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2051 self._downloader.trouble(u'ERROR: unable to extract video description')
2053 video_description = mobj.group(1).decode('utf-8')
2054 if not video_description:
2055 video_description = 'No description available.'
2057 # Extract video height and width
2058 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video height')
2062 yv_video_height = mobj.group(1)
2064 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2066 self._downloader.trouble(u'ERROR: unable to extract video width')
2068 yv_video_width = mobj.group(1)
2070 # Retrieve video playlist to extract media URL
2071 # I'm not completely sure what all these options are, but we
2072 # seem to need most of them, otherwise the server sends a 401.
2073 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2074 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2075 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2076 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2077 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2079 self.report_download_webpage(video_id)
2080 webpage = urllib2.urlopen(request).read()
2081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2082 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2085 # Extract media URL from playlist XML
2086 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2088 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2090 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2091 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2094 # Process video information
2095 self._downloader.process_info({
2096 'id': video_id.decode('utf-8'),
2098 'uploader': video_uploader,
2099 'upload_date': u'NA',
2100 'title': video_title,
2101 'stitle': simple_title,
2102 'ext': video_extension.decode('utf-8'),
2103 'thumbnail': video_thumbnail.decode('utf-8'),
2104 'description': video_description,
2105 'thumbnail': video_thumbnail,
2108 except UnavailableVideoError:
2109 self._downloader.trouble(u'\nERROR: unable to download video')
# VimeoIE: scrapes a vimeo.com watch page, extracts the inline JSON player
# config, and builds a play_redirect URL from the request signature,
# timestamp, chosen codec and quality.
# NOTE(review): this excerpt is a sampled view of the file — the embedded
# original line numbers jump, so control lines between visible statements
# (e.g. "if mobj is None:", "try:", "except:", "return") are elided here.
# Do not assume adjacent visible lines are adjacent in the real file.
2112 class VimeoIE(InfoExtractor):
2113 """Information extractor for vimeo.com."""
2115 # _VALID_URL matches Vimeo URLs
# Group 1 captures the numeric video id; accepts www/player subdomains,
# optional groups/ and video(s)/ path segments.
2116 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2119 def __init__(self, downloader=None):
2120 InfoExtractor.__init__(self, downloader)
2122 def report_download_webpage(self, video_id):
2123 """Report webpage download."""
2124 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2126 def report_extraction(self, video_id):
2127 """Report information extraction."""
2128 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2130 def _real_extract(self, url, new_video=True):
2131 # Extract ID from URL
2132 mobj = re.match(self._VALID_URL, url)
2134 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2137 # At this point we have a new video
2138 self._downloader.increment_downloads()
2139 video_id = mobj.group(1)
2141 # Retrieve video webpage to extract further information
# std_headers supplies a browser-like User-Agent (see file header) so the
# page served matches what a real browser would get.
2142 request = urllib2.Request(url, None, std_headers)
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2150 # Now we begin extracting as much information as we can from what we
2151 # retrieved. First we extract the information common to all extractors,
2152 # and latter we extract those that are Vimeo specific.
2153 self.report_extraction(video_id)
2155 # Extract the config JSON
# Fragile: isolates the JSON object by raw string splitting on the
# surrounding JS assignment markers rather than parsing the HTML/JS.
2156 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2158 config = json.loads(config)
2160 self._downloader.trouble(u'ERROR: unable to extract info section')
2164 video_title = config["video"]["title"]
2165 simple_title = _simplify_title(video_title)
2168 video_uploader = config["video"]["owner"]["name"]
2170 # Extract video thumbnail
2171 video_thumbnail = config["video"]["thumbnail"]
2173 # Extract video description
# Two strategies: meta description tag first, then an lxml xpath over the
# page's id="description" element (elided branching between them).
2177 video_description = u'No description available.'
2178 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2179 if mobj is not None:
2180 video_description = mobj.group(1)
2182 html_parser = lxml.etree.HTMLParser()
2183 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2184 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2185 # TODO use another parser
2187 # Extract upload date
2188 video_upload_date = u'NA'
2189 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2190 if mobj is not None:
2191 video_upload_date = mobj.group(1)
2193 # Vimeo specific: extract request signature and timestamp
2194 sig = config['request']['signature']
2195 timestamp = config['request']['timestamp']
2197 # Vimeo specific: extract video codec and quality information
2198 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 (both flv).
2199 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2200 for codec in codecs:
2201 if codec[0] in config["video"]["files"]:
2202 video_codec = codec[0]
2203 video_extension = codec[1]
2204 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2205 else: quality = 'sd'
2208 self._downloader.trouble(u'ERROR: no known codec found')
2211 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2212 %(video_id, sig, timestamp, quality, video_codec.upper())
2215 # Process video information
2216 self._downloader.process_info({
2219 'uploader': video_uploader,
2220 'upload_date': video_upload_date,
2221 'title': video_title,
2222 'stitle': simple_title,
2223 'ext': video_extension,
2224 'thumbnail': video_thumbnail,
2225 'description': video_description,
2228 except UnavailableVideoError:
2229 self._downloader.trouble(u'ERROR: unable to download video')
# GenericIE: last-resort extractor. First follows URL-shortener style
# redirects via HEAD requests, then heuristically finds a media URL in the
# page (JW Player flashvars, then file=/source= params) and derives
# id/title/uploader from the URL and <title>.
# NOTE(review): sampled view — embedded original line numbers jump;
# guard/return/try lines between visible statements are elided.
2232 class GenericIE(InfoExtractor):
2233 """Generic last-resort information extractor."""
2236 IE_NAME = u'generic'
2238 def __init__(self, downloader=None):
2239 InfoExtractor.__init__(self, downloader)
2241 def report_download_webpage(self, video_id):
2242 """Report webpage download."""
2243 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2244 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2246 def report_extraction(self, video_id):
2247 """Report information extraction."""
2248 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2250 def report_following_redirect(self, new_url):
2251 """Report information extraction."""
2252 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
2254 def _test_redirect(self, url):
2255 """Check if it is a redirect, like url shorteners, in case restart chain."""
# HeadRequest: urllib2.Request subclass whose get_method returns "HEAD"
# (the return line is elided in this view) so only headers are fetched.
2256 class HeadRequest(urllib2.Request):
2257 def get_method(self):
2260 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
2262 Subclass the HTTPRedirectHandler to make it use our
2263 HeadRequest also on the redirected URL
2265 def redirect_request(self, req, fp, code, msg, headers, newurl):
2266 if code in (301, 302, 303, 307):
# Some servers emit spaces in Location headers; percent-encode them.
2267 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers, which don't apply to the HEAD follow-up.
2268 newheaders = dict((k,v) for k,v in req.headers.items()
2269 if k.lower() not in ("content-length", "content-type"))
2270 return HeadRequest(newurl,
2272 origin_req_host=req.get_origin_req_host(),
2275 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
2277 class HTTPMethodFallback(urllib2.BaseHandler):
2279 Fallback to GET if HEAD is not allowed (405 HTTP error)
2281 def http_error_405(self, req, fp, code, msg, headers):
2285 newheaders = dict((k,v) for k,v in req.headers.items()
2286 if k.lower() not in ("content-length", "content-type"))
# Re-issue as a plain (GET) urllib2.Request through the parent opener.
2287 return self.parent.open(urllib2.Request(req.get_full_url(),
2289 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener by hand so our HEAD-aware handlers are used
# instead of the default redirect handling.
2293 opener = urllib2.OpenerDirector()
2294 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
2295 HTTPMethodFallback, HEADRedirectHandler,
2296 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
2297 opener.add_handler(handler())
2299 response = opener.open(HeadRequest(url))
2300 new_url = response.geturl()
# Same URL after following redirects means nothing to restart.
2302 if url == new_url: return False
2304 self.report_following_redirect(new_url)
# Restart the whole download chain on the resolved URL.
2305 self._downloader.download([new_url])
2308 def _real_extract(self, url):
2309 if self._test_redirect(url): return
2311 # At this point we have a new video
2312 self._downloader.increment_downloads()
2314 video_id = url.split('/')[-1]
2315 request = urllib2.Request(url)
2317 self.report_download_webpage(video_id)
2318 webpage = urllib2.urlopen(request).read()
2319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2322 except ValueError, err:
2323 # since this is the last-resort InfoExtractor, if
2324 # this error is thrown, it'll be thrown here
2325 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2328 self.report_extraction(video_id)
2329 # Start with something easy: JW Player in SWFObject
2330 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2332 # Broaden the search a little bit
2333 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2335 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2338 # It's possible that one of the regexes
2339 # matched, but returned an empty group:
2340 if mobj.group(1) is None:
2341 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2344 video_url = urllib.unquote(mobj.group(1))
2345 video_id = os.path.basename(video_url)
2347 # here's a fun little line of code for you:
# Split basename into (id, extension) — ext without the leading dot.
2348 video_extension = os.path.splitext(video_id)[1][1:]
2349 video_id = os.path.splitext(video_id)[0]
2351 # it's tempting to parse this further, but you would
2352 # have to take into account all the variations like
2353 # Video Title - Site Name
2354 # Site Name | Video Title
2355 # Video Title - Tagline | Site Name
2356 # and so on and so forth; it's just not practical
2357 mobj = re.search(r'<title>(.*)</title>', webpage)
2359 self._downloader.trouble(u'ERROR: unable to extract title')
2361 video_title = mobj.group(1).decode('utf-8')
2362 video_title = sanitize_title(video_title)
2363 simple_title = _simplify_title(video_title)
2365 # video uploader is domain name
2366 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2368 self._downloader.trouble(u'ERROR: unable to extract title')
2370 video_uploader = mobj.group(1).decode('utf-8')
2373 # Process video information
2374 self._downloader.process_info({
2375 'id': video_id.decode('utf-8'),
2376 'url': video_url.decode('utf-8'),
2377 'uploader': video_uploader,
2378 'upload_date': u'NA',
2379 'title': video_title,
2380 'stitle': simple_title,
2381 'ext': video_extension.decode('utf-8'),
2385 except UnavailableVideoError, err:
2386 self._downloader.trouble(u'\nERROR: unable to download video')
# YoutubeSearchIE: handles "ytsearch[N|all]:query" pseudo-URLs. Queries the
# YouTube GData API (JSON-C format) 50 results per page, collects video ids,
# then delegates each watch-page URL to the wrapped YoutubeIE.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return lines and the pagenum
# initialisation in _download_n_results.
2389 class YoutubeSearchIE(InfoExtractor):
2390 """Information Extractor for YouTube search queries."""
2391 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2392 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2394 _max_youtube_results = 1000
2395 IE_NAME = u'youtube:search'
2397 def __init__(self, youtube_ie, downloader=None):
2398 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each found video id.
2399 self._youtube_ie = youtube_ie
2401 def report_download_page(self, query, pagenum):
2402 """Report attempt to download playlist page with given number."""
2403 query = query.decode(preferredencoding())
2404 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2406 def _real_initialize(self):
2407 self._youtube_ie.initialize()
2409 def _real_extract(self, query):
2410 mobj = re.match(self._VALID_URL, query)
2412 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a number string N.
2415 prefix, query = query.split(':')
2417 query = query.encode('utf-8')
2419 self._download_n_results(query, 1)
2421 elif prefix == 'all':
2422 self._download_n_results(query, self._max_youtube_results)
2428 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2430 elif n > self._max_youtube_results:
2431 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2432 n = self._max_youtube_results
2433 self._download_n_results(query, n)
2435 except ValueError: # parsing prefix as integer fails
2436 self._download_n_results(query, 1)
2439 def _download_n_results(self, query, n):
2440 """Downloads a specified number of results for a query"""
# Page through the API until we've seen min(n, totalItems) results.
2446 while (50 * pagenum) < limit:
2447 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
2448 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2449 request = urllib2.Request(result_url)
2451 data = urllib2.urlopen(request).read()
2452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2453 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2455 api_response = json.loads(data)['data']
2457 new_ids = list(video['id'] for video in api_response['items'])
2458 video_ids += new_ids
# Clamp the target to what the API says actually exists.
2460 limit = min(n, api_response['totalItems'])
2463 if len(video_ids) > n:
2464 video_ids = video_ids[:n]
2465 for id in video_ids:
2466 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# GoogleSearchIE: handles "gvsearch[N|all]:query" pseudo-URLs by scraping
# Google Video search result pages (10 per page) for videoplay docids and
# delegating each to the wrapped GoogleIE.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return lines and the loop/pagenum
# setup in _download_n_results.
2470 class GoogleSearchIE(InfoExtractor):
2471 """Information Extractor for Google Video search queries."""
2472 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2473 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2474 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" anchor in the result HTML.
2475 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2477 _max_google_results = 1000
2478 IE_NAME = u'video.google:search'
2480 def __init__(self, google_ie, downloader=None):
2481 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each found docid.
2482 self._google_ie = google_ie
2484 def report_download_page(self, query, pagenum):
2485 """Report attempt to download playlist page with given number."""
2486 query = query.decode(preferredencoding())
2487 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2489 def _real_initialize(self):
2490 self._google_ie.initialize()
2492 def _real_extract(self, query):
2493 mobj = re.match(self._VALID_URL, query)
2495 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a number string N.
2498 prefix, query = query.split(':')
2500 query = query.encode('utf-8')
2502 self._download_n_results(query, 1)
2504 elif prefix == 'all':
2505 self._download_n_results(query, self._max_google_results)
2511 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2513 elif n > self._max_google_results:
2514 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2515 n = self._max_google_results
2516 self._download_n_results(query, n)
2518 except ValueError: # parsing prefix as integer fails
2519 self._download_n_results(query, 1)
2522 def _download_n_results(self, query, n):
2523 """Downloads a specified number of results for a query"""
2529 self.report_download_page(query, pagenum)
# start parameter steps by 10 results per page.
2530 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2531 request = urllib2.Request(result_url)
2533 page = urllib2.urlopen(request).read()
2534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2538 # Extract video identifiers
2539 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2540 video_id = mobj.group(1)
2541 if video_id not in video_ids:
2542 video_ids.append(video_id)
2543 if len(video_ids) == n:
2544 # Specified n videos reached
2545 for id in video_ids:
2546 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "next" link: dispatch whatever we collected and stop paging.
2549 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2550 for id in video_ids:
2551 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2554 pagenum = pagenum + 1
# YahooSearchIE: handles "yvsearch[N|all]:query" pseudo-URLs by scraping
# Yahoo! Video search result pages for watch URLs and delegating each to
# the wrapped YahooIE. Same paging pattern as GoogleSearchIE, but with an
# explicit already_seen set for de-duplication.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return lines and the loop/pagenum
# setup in _download_n_results.
2557 class YahooSearchIE(InfoExtractor):
2558 """Information Extractor for Yahoo! Video search queries."""
2559 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2560 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2561 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2562 _MORE_PAGES_INDICATOR = r'\s*Next'
2564 _max_yahoo_results = 1000
2565 IE_NAME = u'video.yahoo:search'
2567 def __init__(self, yahoo_ie, downloader=None):
2568 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each found watch id.
2569 self._yahoo_ie = yahoo_ie
2571 def report_download_page(self, query, pagenum):
2572 """Report attempt to download playlist page with given number."""
2573 query = query.decode(preferredencoding())
2574 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2576 def _real_initialize(self):
2577 self._yahoo_ie.initialize()
2579 def _real_extract(self, query):
2580 mobj = re.match(self._VALID_URL, query)
2582 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a number string N.
2585 prefix, query = query.split(':')
2587 query = query.encode('utf-8')
2589 self._download_n_results(query, 1)
2591 elif prefix == 'all':
2592 self._download_n_results(query, self._max_yahoo_results)
2598 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2600 elif n > self._max_yahoo_results:
2601 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2602 n = self._max_yahoo_results
2603 self._download_n_results(query, n)
2605 except ValueError: # parsing prefix as integer fails
2606 self._download_n_results(query, 1)
2609 def _download_n_results(self, query, n):
2610 """Downloads a specified number of results for a query"""
2613 already_seen = set()
2617 self.report_download_page(query, pagenum)
2618 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2619 request = urllib2.Request(result_url)
2621 page = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2626 # Extract video identifiers
2627 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2628 video_id = mobj.group(1)
2629 if video_id not in already_seen:
2630 video_ids.append(video_id)
2631 already_seen.add(video_id)
2632 if len(video_ids) == n:
2633 # Specified n videos reached
2634 for id in video_ids:
2635 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: dispatch whatever we collected and stop paging.
2638 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2639 for id in video_ids:
2640 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2643 pagenum = pagenum + 1
# YoutubePlaylistIE: scrapes YouTube playlist/artist/course pages for video
# ids, honours --playlist-start/--playlist-end, and delegates each watch
# URL to the wrapped YoutubeIE.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return lines, the pagenum
# initialisation and the else: branch of the prefix handling.
2646 class YoutubePlaylistIE(InfoExtractor):
2647 """Information Extractor for YouTube playlists."""
# Group 1: playlist type char (p/a/list), group 2: playlist id,
# group 3: an optional single video id embedded in the URL.
2649 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2650 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
# %s is filled with the playlist id when scanning a page for video links.
2651 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2652 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2654 IE_NAME = u'youtube:playlist'
2656 def __init__(self, youtube_ie, downloader=None):
2657 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each collected video id.
2658 self._youtube_ie = youtube_ie
2660 def report_download_page(self, playlist_id, pagenum):
2661 """Report attempt to download playlist page with given number."""
2662 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2664 def _real_initialize(self):
2665 self._youtube_ie.initialize()
2667 def _real_extract(self, url):
2668 # Extract playlist id
2669 mobj = re.match(self._VALID_URL, url)
2671 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video inside a playlist URL: extract just that video.
2675 if mobj.group(3) is not None:
2676 self._youtube_ie.extract(mobj.group(3))
2679 # Download playlist pages
2680 # prefix is 'p' as default for playlists but there are other types that need extra care
2681 playlist_prefix = mobj.group(1)
2682 if playlist_prefix == 'a':
2683 playlist_access = 'artist'
2685 playlist_prefix = 'p'
2686 playlist_access = 'view_play_list'
2687 playlist_id = mobj.group(2)
2692 self.report_download_page(playlist_id, pagenum)
2693 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2694 request = urllib2.Request(url)
2696 page = urllib2.urlopen(request).read()
2697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2698 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2701 # Extract video identifiers
2703 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2704 if mobj.group(1) not in ids_in_page:
2705 ids_in_page.append(mobj.group(1))
2706 video_ids.extend(ids_in_page)
# Stop paging once the "Next" link disappears.
2708 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2710 pagenum = pagenum + 1
# Apply --playlist-start (1-based option, converted to 0-based slice)
# and --playlist-end (-1 means "to the end").
2712 playliststart = self._downloader.params.get('playliststart', 1) - 1
2713 playlistend = self._downloader.params.get('playlistend', -1)
2714 if playlistend == -1:
2715 video_ids = video_ids[playliststart:]
2717 video_ids = video_ids[playliststart:playlistend]
2719 for id in video_ids:
2720 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# YoutubeUserIE: collects all upload video ids for a YouTube user via the
# GData API, paging _GDATA_PAGE_SIZE at a time, then delegates each watch
# URL to the wrapped YoutubeIE, honouring --playlist-start/--playlist-end.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return/break lines and the
# pagenum loop setup.
2724 class YoutubeUserIE(InfoExtractor):
2725 """Information Extractor for YouTube users."""
2727 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2728 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2729 _GDATA_PAGE_SIZE = 50
2730 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2731 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2733 IE_NAME = u'youtube:user'
2735 def __init__(self, youtube_ie, downloader=None):
2736 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each collected video id.
2737 self._youtube_ie = youtube_ie
2739 def report_download_page(self, username, start_index):
2740 """Report attempt to download user page."""
2741 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2742 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2744 def _real_initialize(self):
2745 self._youtube_ie.initialize()
2747 def _real_extract(self, url):
2749 mobj = re.match(self._VALID_URL, url)
2751 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2754 username = mobj.group(1)
2756 # Download video ids using YouTube Data API. Result size per
2757 # query is limited (currently to 50 videos) so we need to query
2758 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2765 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2766 self.report_download_page(username, start_index)
2768 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2771 page = urllib2.urlopen(request).read()
2772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2776 # Extract video identifiers
2779 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2780 if mobj.group(1) not in ids_in_page:
2781 ids_in_page.append(mobj.group(1))
2783 video_ids.extend(ids_in_page)
2785 # A little optimization - if current page is not
2786 # "full", ie. does not contain PAGE_SIZE video ids then
2787 # we can assume that this page is the last one - there
2788 # are no more ids on further pages - no need to query
2791 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2796 all_ids_count = len(video_ids)
# Apply --playlist-start (1-based option, converted to 0-based slice)
# and --playlist-end (-1 means "to the end").
2797 playliststart = self._downloader.params.get('playliststart', 1) - 1
2798 playlistend = self._downloader.params.get('playlistend', -1)
2800 if playlistend == -1:
2801 video_ids = video_ids[playliststart:]
2803 video_ids = video_ids[playliststart:playlistend]
2805 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2806 (username, all_ids_count, len(video_ids)))
2808 for video_id in video_ids:
2809 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# DepositFilesIE: fetches a depositfiles.com file page with the
# "Free download" form pre-submitted, scrapes the real fileshare URL and
# the file title, then hands the info to the downloader.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return lines and the uploader
# field of the process_info dict.
2812 class DepositFilesIE(InfoExtractor):
2813 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the "../" segment is a 2-char locale code.
2815 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2816 IE_NAME = u'DepositFiles'
2818 def __init__(self, downloader=None):
2819 InfoExtractor.__init__(self, downloader)
2821 def report_download_webpage(self, file_id):
2822 """Report webpage download."""
2823 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2825 def report_extraction(self, file_id):
2826 """Report information extraction."""
2827 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2829 def _real_extract(self, url):
2830 # At this point we have a new file
2831 self._downloader.increment_downloads()
2833 file_id = url.split('/')[-1]
2834 # Rebuild url in english locale
2835 url = 'http://depositfiles.com/en/files/' + file_id
2837 # Retrieve file webpage with 'Free download' button pressed
2838 free_download_indication = { 'gateway_result' : '1' }
# POSTing the form data simulates pressing the "Free download" button.
2839 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2841 self.report_download_webpage(file_id)
2842 webpage = urllib2.urlopen(request).read()
2843 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2844 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2847 # Search for the real file URL
2848 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2849 if (mobj is None) or (mobj.group(1) is None):
2850 # Try to figure out reason of the error.
# The site explains download restrictions in a <strong>Attention...</strong>
# block; surface that message instead of a generic error when present.
2851 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2852 if (mobj is not None) and (mobj.group(1) is not None):
2853 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2854 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2856 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2859 file_url = mobj.group(1)
# Extension without the leading dot, taken from the URL path.
2860 file_extension = os.path.splitext(file_url)[1][1:]
2862 # Search for file title
2863 mobj = re.search(r'<b title="(.*?)">', webpage)
2865 self._downloader.trouble(u'ERROR: unable to extract title')
2867 file_title = mobj.group(1).decode('utf-8')
2870 # Process file information
2871 self._downloader.process_info({
2872 'id': file_id.decode('utf-8'),
2873 'url': file_url.decode('utf-8'),
2875 'upload_date': u'NA',
2876 'title': file_title,
2877 'stitle': file_title,
2878 'ext': file_extension.decode('utf-8'),
2882 except UnavailableVideoError, err:
2883 self._downloader.trouble(u'ERROR: unable to download file')
# FacebookIE: logs in to Facebook (credentials from CLI options or .netrc),
# downloads a video page, extracts metadata and per-format stream URLs via
# regexes over the page's JavaScript, selects formats per the downloader's
# format options, and dispatches each chosen format for download.
# NOTE(review): sampled view — embedded original line numbers jump; elided
# lines include "if mobj is None:", try:/return/else: lines, the
# _video_extensions mapping values, and the login_form construction.
2886 class FacebookIE(InfoExtractor):
2887 """Information Extractor for Facebook"""
2889 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2890 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2891 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; used both for scraping URLs and format selection.
2892 _available_formats = ['video', 'highqual', 'lowqual']
2893 _video_extensions = {
2898 IE_NAME = u'facebook'
2900 def __init__(self, downloader=None):
2901 InfoExtractor.__init__(self, downloader)
2903 def _reporter(self, message):
2904 """Add header and report message."""
2905 self._downloader.to_screen(u'[facebook] %s' % message)
2907 def report_login(self):
2908 """Report attempt to log in."""
2909 self._reporter(u'Logging in')
2911 def report_video_webpage_download(self, video_id):
2912 """Report attempt to download video webpage."""
2913 self._reporter(u'%s: Downloading video webpage' % video_id)
2915 def report_information_extraction(self, video_id):
2916 """Report attempt to extract video information."""
2917 self._reporter(u'%s: Extracting video information' % video_id)
2919 def _parse_page(self, video_webpage):
2920 """Extract video information from page"""
# Map of info-dict field name -> regex over the page's inline JS.
2922 data = {'title': r'\("video_title", "(.*?)"\)',
2923 'description': r'<div class="datawrap">(.*?)</div>',
2924 'owner': r'\("video_owner_name", "(.*?)"\)',
2925 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2928 for piece in data.keys():
2929 mobj = re.search(data[piece], video_webpage)
2930 if mobj is not None:
# Values are JS-escaped Unicode; unquote_plus after unicode_escape decode.
2931 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2935 for fmt in self._available_formats:
2936 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2937 if mobj is not None:
2938 # URL is in a Javascript segment inside an escaped Unicode format within
2939 # the generally utf-8 page
2940 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2941 video_info['video_urls'] = video_urls
2945 def _real_initialize(self):
2946 if self._downloader is None:
2951 downloader_params = self._downloader.params
2953 # Attempt to use provided username and password or .netrc data
2954 if downloader_params.get('username', None) is not None:
2955 useremail = downloader_params['username']
2956 password = downloader_params['password']
2957 elif downloader_params.get('usenetrc', False):
2959 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2960 if info is not None:
2964 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2965 except (IOError, netrc.NetrcParseError), err:
2966 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login entirely (elided return).
2969 if useremail is None:
2978 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2981 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means login failed.
2982 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2983 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2986 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2989 def _real_extract(self, url):
2990 mobj = re.match(self._VALID_URL, url)
2992 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2994 video_id = mobj.group('ID')
2997 self.report_video_webpage_download(video_id)
2998 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
3000 page = urllib2.urlopen(request)
3001 video_webpage = page.read()
3002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3003 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3006 # Start extracting information
3007 self.report_information_extraction(video_id)
3009 # Extract information
3010 video_info = self._parse_page(video_webpage)
3013 if 'owner' not in video_info:
3014 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3016 video_uploader = video_info['owner']
3019 if 'title' not in video_info:
3020 self._downloader.trouble(u'ERROR: unable to extract video title')
3022 video_title = video_info['title']
3023 video_title = video_title.decode('utf-8')
3024 video_title = sanitize_title(video_title)
3026 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning; fall back to empty string.
3029 if 'thumbnail' not in video_info:
3030 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3031 video_thumbnail = ''
3033 video_thumbnail = video_info['thumbnail']
3037 if 'upload_date' in video_info:
3038 upload_time = video_info['upload_date']
# Parse RFC-2822 style date into YYYYMMDD for the info dict.
3039 timetuple = email.utils.parsedate_tz(upload_time)
3040 if timetuple is not None:
3042 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3047 video_description = video_info.get('description', 'No description available.')
3049 url_map = video_info['video_urls']
3050 if len(url_map.keys()) > 0:
3051 # Decide which formats to download
3052 req_format = self._downloader.params.get('format', None)
3053 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: only formats at or below the limit remain.
3055 if format_limit is not None and format_limit in self._available_formats:
3056 format_list = self._available_formats[self._available_formats.index(format_limit):]
3058 format_list = self._available_formats
3059 existing_formats = [x for x in format_list if x in url_map]
3060 if len(existing_formats) == 0:
3061 self._downloader.trouble(u'ERROR: no known formats available for video')
3063 if req_format is None:
3064 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3065 elif req_format == 'worst':
3066 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3067 elif req_format == '-1':
3068 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3071 if req_format not in url_map:
3072 self._downloader.trouble(u'ERROR: requested format not available')
3074 video_url_list = [(req_format, url_map[req_format])] # Specific format
3076 for format_param, video_real_url in video_url_list:
3078 # At this point we have a new video
3079 self._downloader.increment_downloads()
3082 video_extension = self._video_extensions.get(format_param, 'mp4')
3085 # Process video information
3086 self._downloader.process_info({
3087 'id': video_id.decode('utf-8'),
3088 'url': video_real_url.decode('utf-8'),
3089 'uploader': video_uploader.decode('utf-8'),
3090 'upload_date': upload_date,
3091 'title': video_title,
3092 'stitle': simple_title,
3093 'ext': video_extension.decode('utf-8'),
3094 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3095 'thumbnail': video_thumbnail.decode('utf-8'),
3096 'description': video_description.decode('utf-8'),
3099 except UnavailableVideoError, err:
3100 self._downloader.trouble(u'\nERROR: unable to download video')
3102 class BlipTVIE(InfoExtractor):
# Extracts downloadable media information from blip.tv pages, either by
# following a direct video/* response or by querying the site's JSON skin.
# NOTE(review): this numbered listing has gaps -- `try:` openers,
# `if ... is None:` guards, `return`s and dict openers were dropped by the
# dump (see the orphaned `except` branches below). Comments only; the code
# bytes are untouched.
3103 """Information extractor for blip.tv"""
3105 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# _URL_EXT pulls the trailing file extension out of the media URL.
3106 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3107 IE_NAME = u'blip.tv'
3109 def report_extraction(self, file_id):
3110 """Report information extraction."""
3111 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3113 def report_direct_download(self, title):
3114 """Report information extraction."""
3115 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3117 def _real_extract(self, url):
3118 mobj = re.match(self._VALID_URL, url)
3120 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for machine-readable metadata via its JSON "skin".
# `cchar` ('?' or '&') is chosen on an elided line -- TODO confirm.
3127 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3128 request = urllib2.Request(json_url)
3129 self.report_extraction(mobj.group(1))
3132 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL already points at the media file.
3133 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3134 basename = url.split('/')[-1]
3135 title,ext = os.path.splitext(basename)
3136 title = title.decode('UTF-8')
3137 ext = ext.replace('.', '')
3138 self.report_direct_download(title)
# (elided: the rest of the direct-download `info` dict literal)
3143 'stitle': _simplify_title(title),
3147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3148 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# `info` stays None unless the direct-download branch filled it in.
3150 if info is None: # Regular URL
3152 json_code = urlh.read()
3153 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3154 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3158 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key; otherwise the top-level
# object is used (else-branch elided by the dump).
3159 if 'Post' in json_data:
3160 data = json_data['Post']
# 'datestamp' like '08-21-11 06:10PM' -> canonical YYYYMMDD.
3164 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3165 video_url = data['media']['url']
3166 umobj = re.match(self._URL_EXT, video_url)
# Raised here so the (ValueError, KeyError) handler below reports it.
3168 raise ValueError('Can not determine filename extension')
3169 ext = umobj.group(1)
3172 'id': data['item_id'],
3174 'uploader': data['display_name'],
3175 'upload_date': upload_date,
3176 'title': data['title'],
3177 'stitle': _simplify_title(data['title']),
3179 'format': data['media']['mimeType'],
3180 'thumbnail': data['thumbnailUrl'],
3181 'description': data['description'],
3182 'player_url': data['embedUrl']
# KeyError covers missing JSON fields; ValueError covers bad date/extension.
3184 except (ValueError,KeyError), err:
3185 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3188 self._downloader.increment_downloads()
3191 self._downloader.process_info(info)
3192 except UnavailableVideoError, err:
3193 self._downloader.trouble(u'\nERROR: unable to download video')
3196 class MyVideoIE(InfoExtractor):
3197 """Information Extractor for myvideo.de."""
3199 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3200 IE_NAME = u'myvideo'
3202 def __init__(self, downloader=None):
3203 InfoExtractor.__init__(self, downloader)
3205 def report_download_webpage(self, video_id):
3206 """Report webpage download."""
3207 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3209 def report_extraction(self, video_id):
3210 """Report information extraction."""
3211 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3213 def _real_extract(self,url):
3214 mobj = re.match(self._VALID_URL, url)
3216 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3219 video_id = mobj.group(1)
3222 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3224 self.report_download_webpage(video_id)
3225 webpage = urllib2.urlopen(request).read()
3226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3230 self.report_extraction(video_id)
3231 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3234 self._downloader.trouble(u'ERROR: unable to extract media URL')
3236 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3238 mobj = re.search('<title>([^<]+)</title>', webpage)
3240 self._downloader.trouble(u'ERROR: unable to extract title')
3243 video_title = mobj.group(1)
3244 video_title = sanitize_title(video_title)
3246 simple_title = _simplify_title(video_title)
3249 self._downloader.process_info({
3253 'upload_date': u'NA',
3254 'title': video_title,
3255 'stitle': simple_title,
3260 except UnavailableVideoError:
3261 self._downloader.trouble(u'\nERROR: Unable to download video')
3263 class ComedyCentralIE(InfoExtractor):
# Extracts full episodes of The Daily Show / The Colbert Report via the
# mtvnservices Flash URI and Comedy Central's MRSS index + mediaGen feeds.
# NOTE(review): this numbered listing has gaps -- `if mobj is None:` guards,
# `try:` openers, `return`s and parts of dict literals were dropped by the
# dump. Comments only; the code bytes are untouched.
3264 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ':tds'/':cr'-style shortcut or a full-episodes URL.
3266 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3267 IE_NAME = u'comedycentral'
3269 def report_extraction(self, episode_id):
3270 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3272 def report_config_download(self, episode_id):
3273 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3275 def report_index_download(self, episode_id):
3276 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3278 def report_player_url(self, episode_id):
3279 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3281 def _real_extract(self, url):
3282 mobj = re.match(self._VALID_URL, url)
3284 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Map shortcut names to the show's full-episodes index, then re-match so the
# named groups below are populated.
3287 if mobj.group('shortname'):
3288 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3289 url = u'http://www.thedailyshow.com/full-episodes/'
3291 url = u'http://www.colbertnation.com/full-episodes/'
3292 mobj = re.match(self._VALID_URL, url)
3293 assert mobj is not None
# No episode group means "download the newest episode".
3295 dlNewest = not mobj.group('episode')
3297 epTitle = mobj.group('showname')
3299 epTitle = mobj.group('episode')
3301 req = urllib2.Request(url)
3302 self.report_extraction(epTitle)
3304 htmlHandle = urllib2.urlopen(req)
3305 html = htmlHandle.read()
3306 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3307 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the server redirect to get a concrete episode URL.
3310 url = htmlHandle.geturl()
3311 mobj = re.match(self._VALID_URL, url)
3313 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3315 if mobj.group('episode') == '':
3316 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3318 epTitle = mobj.group('episode')
# The page embeds the mtvnservices player URL; group 2 is the media URI used
# to query the MRSS index below.
3320 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3321 if len(mMovieParams) == 0:
3322 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3325 playerUrl_raw = mMovieParams[0][0]
3326 self.report_player_url(epTitle)
# Resolve redirects so process_info gets the final player URL.
3328 urlHandle = urllib2.urlopen(playerUrl_raw)
3329 playerUrl = urlHandle.geturl()
3330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3331 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3334 uri = mMovieParams[0][1]
3335 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3336 self.report_index_download(epTitle)
3338 indexXml = urllib2.urlopen(indexUrl).read()
3339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3340 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment of the episode.
3343 idoc = xml.etree.ElementTree.fromstring(indexXml)
3344 itemEls = idoc.findall('.//item')
3345 for itemEl in itemEls:
3346 mediaId = itemEl.findall('./guid')[0].text
3347 shortMediaId = mediaId.split(':')[-1]
3348 showId = mediaId.split(':')[-2].replace('.com', '')
3349 officialTitle = itemEl.findall('./title')[0].text
3350 officialDate = itemEl.findall('./pubDate')[0].text
# mediaGen feed lists the actual renditions (bitrate -> src URL).
3352 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3353 urllib.urlencode({'uri': mediaId}))
3354 configReq = urllib2.Request(configUrl)
3355 self.report_config_download(epTitle)
3357 configXml = urllib2.urlopen(configReq).read()
3358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3359 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3362 cdoc = xml.etree.ElementTree.fromstring(configXml)
# `turls` (built on elided lines) collects (bitrate, url) pairs.
3364 for rendition in cdoc.findall('.//rendition'):
3365 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3369 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3372 # For now, just pick the highest bitrate
3373 format,video_url = turls[-1]
3375 self._downloader.increment_downloads()
3377 effTitle = showId + u'-' + epTitle
3382 'upload_date': officialDate,
3384 'stitle': _simplify_title(effTitle),
3388 'description': officialTitle,
3389 'player_url': playerUrl
3393 self._downloader.process_info(info)
3394 except UnavailableVideoError, err:
3395 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3399 class EscapistIE(InfoExtractor):
# Extracts videos from escapistmagazine.com: scrapes OpenGraph meta tags for
# description/thumbnail/player, then reads the player's JS config to get the
# playlist media URL.
# NOTE(review): numbered listing with elided lines (`if mobj is None:`
# guards, `try:` openers, `return`s, parts of the info dict). Comments only.
3400 """Information extractor for The Escapist """
3402 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3403 IE_NAME = u'escapist'
3405 def report_extraction(self, showName):
3406 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3408 def report_config_download(self, showName):
3409 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3411 def _real_extract(self, url):
# Used only to unescape HTML entities in scraped attribute values.
3412 htmlParser = HTMLParser.HTMLParser()
3414 mobj = re.match(self._VALID_URL, url)
3416 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3418 showName = mobj.group('showname')
3419 videoId = mobj.group('episode')
3421 self.report_extraction(showName)
3423 webPage = urllib2.urlopen(url).read()
3424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3425 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata out of <meta> tags (description, og:image, og:video).
3428 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3429 description = htmlParser.unescape(descMatch.group(1))
3430 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3431 imgUrl = htmlParser.unescape(imgMatch.group(1))
3432 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3433 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries a percent-encoded config URL in its query string.
3434 configUrlMatch = re.search('config=(.*)$', playerUrl)
3435 configUrl = urllib2.unquote(configUrlMatch.group(1))
3437 self.report_config_download(showName)
3439 configJSON = urllib2.urlopen(configUrl).read()
3440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3441 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3444 # Technically, it's JavaScript, not JSON
# Crude single->double quote swap so json.loads can parse the JS object.
3445 configJSON = configJSON.replace("'", '"')
3448 config = json.loads(configJSON)
3449 except (ValueError,), err:
3450 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is the media entry -- presumably [0] is an ad/intro; verify.
3453 playlist = config['playlist']
3454 videoUrl = playlist[1]['url']
3456 self._downloader.increment_downloads()
3460 'uploader': showName,
3461 'upload_date': None,
3463 'stitle': _simplify_title(showName),
3466 'thumbnail': imgUrl,
3467 'description': description,
3468 'player_url': playerUrl,
3472 self._downloader.process_info(info)
3473 except UnavailableVideoError, err:
3474 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3477 class CollegeHumorIE(InfoExtractor):
# Extracts videos from collegehumor.com: resolves the page's internal video
# id, then reads the moogaloop metadata XML for title/url/thumbnail.
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# `return`s, the info dict opener). Comments only; code bytes untouched.
3478 """Information extractor for collegehumor.com"""
3480 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3481 IE_NAME = u'collegehumor'
3483 def report_webpage(self, video_id):
3484 """Report information extraction."""
3485 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3487 def report_extraction(self, video_id):
3488 """Report information extraction."""
3489 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3491 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used on any visible line.
3492 htmlParser = HTMLParser.HTMLParser()
3494 mobj = re.match(self._VALID_URL, url)
3496 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3498 video_id = mobj.group('videoid')
3500 self.report_webpage(video_id)
3501 request = urllib2.Request(url)
3503 webpage = urllib2.urlopen(request).read()
3504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3505 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id used by the moogaloop player.
3508 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3510 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3512 internal_video_id = m.group('internalvideoid')
3516 'internal_id': internal_video_id,
3519 self.report_extraction(video_id)
3520 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3522 metaXml = urllib2.urlopen(xmlUrl).read()
3523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3524 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Pull the remaining fields out of the metadata XML; any missing node makes
# findall(...)[0] raise, caught by the handler below as invalid metadata.
3527 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3529 videoNode = mdoc.findall('./video')[0]
3530 info['description'] = videoNode.findall('./description')[0].text
3531 info['title'] = videoNode.findall('./caption')[0].text
3532 info['stitle'] = _simplify_title(info['title'])
3533 info['url'] = videoNode.findall('./file')[0].text
3534 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension taken from the media URL; also reused as the 'format' label.
3535 info['ext'] = info['url'].rpartition('.')[2]
3536 info['format'] = info['ext']
3538 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3541 self._downloader.increment_downloads()
3544 self._downloader.process_info(info)
3545 except UnavailableVideoError, err:
3546 self._downloader.trouble(u'\nERROR: unable to download video')
3549 class XVideosIE(InfoExtractor):
# Extracts videos from xvideos.com by scraping flv_url, <title>, and the
# thumbnail URL out of the watch page.
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# `return`s, the info dict opener). Comments only; code bytes untouched.
3550 """Information extractor for xvideos.com"""
3552 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3553 IE_NAME = u'xvideos'
3555 def report_webpage(self, video_id):
3556 """Report information extraction."""
3557 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3559 def report_extraction(self, video_id):
3560 """Report information extraction."""
3561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3563 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used on any visible line.
3564 htmlParser = HTMLParser.HTMLParser()
3566 mobj = re.match(self._VALID_URL, url)
3568 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3570 video_id = mobj.group(1).decode('utf-8')
3572 self.report_webpage(video_id)
3574 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3576 webpage = urllib2.urlopen(request).read()
3577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3578 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3581 self.report_extraction(video_id)
# The media URL is percent-encoded inside the page's flv_url parameter.
3585 mobj = re.search(r'flv_url=(.+?)&', webpage)
3587 self._downloader.trouble(u'ERROR: unable to extract video url')
3589 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text up to the trailing " - XVID..." suffix.
3593 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3595 self._downloader.trouble(u'ERROR: unable to extract video title')
3597 video_title = mobj.group(1).decode('utf-8')
3600 # Extract video thumbnail
3601 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3603 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3605 video_thumbnail = mobj.group(1).decode('utf-8')
3609 self._downloader.increment_downloads()
3614 'upload_date': None,
3615 'title': video_title,
3616 'stitle': _simplify_title(video_title),
3619 'thumbnail': video_thumbnail,
3620 'description': None,
3625 self._downloader.process_info(info)
3626 except UnavailableVideoError, err:
3627 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3630 class SoundcloudIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# `return`s, parts of the info dict). Comments only; code bytes untouched.
3631 """Information extractor for soundcloud.com
3632 To access the media, the uid of the song and a stream token
3633 must be extracted from the page source and the script must make
3634 a request to media.soundcloud.com/crossdomain.xml. Then
3635 the media can be grabbed by requesting from an url composed
3636 of the stream token and uid
# URL groups: (1) uploader slug, (2) track slug.
3639 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3640 IE_NAME = u'soundcloud'
3642 def __init__(self, downloader=None):
3643 InfoExtractor.__init__(self, downloader)
3645 def report_webpage(self, video_id):
3646 """Report information extraction."""
3647 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3649 def report_extraction(self, video_id):
3650 """Report information extraction."""
3651 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3653 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used on any visible line.
3654 htmlParser = HTMLParser.HTMLParser()
3656 mobj = re.match(self._VALID_URL, url)
3658 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3661 # extract uploader (which is in the url)
3662 uploader = mobj.group(1).decode('utf-8')
3663 # extract simple title (uploader + slug of song title)
3664 slug_title = mobj.group(2).decode('utf-8')
3665 simple_title = uploader + '-' + slug_title
3667 self.report_webpage('%s/%s' % (uploader, slug_title))
3669 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3671 webpage = urllib2.urlopen(request).read()
3672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3673 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3676 self.report_extraction('%s/%s' % (uploader, slug_title))
3678 # extract uid and stream token that soundcloud hands out for access
3679 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3681 video_id = mobj.group(1)
3682 stream_token = mobj.group(2)
3684 # extract unsimplified title
3685 mobj = re.search('"title":"(.*?)",', webpage)
3687 title = mobj.group(1)
3689 # construct media url (with uid/token)
3690 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3691 mediaURL = mediaURL % (video_id, stream_token)
# Description is best-effort; the default stands if the regex fails.
3694 description = u'No description available'
3695 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3697 description = mobj.group(1)
# Upload date is also best-effort: parse "on Month DD, YYYY HH:MM".
3701 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3704 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3705 except Exception, e:
3708 # for soundcloud, a request to a cross domain is required for cookies
3709 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3712 self._downloader.process_info({
3713 'id': video_id.decode('utf-8'),
3715 'uploader': uploader.decode('utf-8'),
3716 'upload_date': upload_date,
# NOTE(review): 'title' uses simple_title (slug), not the scraped `title`
# above -- looks unintentional, but left as-is on a visible line.
3717 'title': simple_title.decode('utf-8'),
3718 'stitle': simple_title.decode('utf-8'),
3722 'description': description.decode('utf-8')
3724 except UnavailableVideoError:
3725 self._downloader.trouble(u'\nERROR: unable to download video')
3728 class InfoQIE(InfoExtractor):
# Extracts talks from infoq.com: the base64-encoded jsclassref attribute
# holds the RTMPE stream path.
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# `return`s, the info dict opener; IE_NAME not visible). Comments only.
3729 """Information extractor for infoq.com"""
3731 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3734 def report_webpage(self, video_id):
3735 """Report information extraction."""
3736 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3738 def report_extraction(self, video_id):
3739 """Report information extraction."""
3740 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3742 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used on any visible line.
3743 htmlParser = HTMLParser.HTMLParser()
3745 mobj = re.match(self._VALID_URL, url)
3747 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3750 self.report_webpage(url)
3752 request = urllib2.Request(url)
3754 webpage = urllib2.urlopen(request).read()
3755 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3756 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3759 self.report_extraction(url)
# jsclassref is base64; decoded + unquoted it yields the RTMPE stream path.
3763 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3765 self._downloader.trouble(u'ERROR: unable to extract video url')
3767 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3771 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3773 self._downloader.trouble(u'ERROR: unable to extract video title')
3775 video_title = mobj.group(1).decode('utf-8')
3777 # Extract description
# Best-effort; the default stands if the meta tag is missing.
3778 video_description = u'No description available.'
3779 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3780 if mobj is not None:
3781 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the stream URL.
3783 video_filename = video_url.split('/')[-1]
3784 video_id, extension = video_filename.split('.')
3786 self._downloader.increment_downloads()
3791 'upload_date': None,
3792 'title': video_title,
3793 'stitle': _simplify_title(video_title),
3795 'format': extension, # Extension is always(?) mp4, but seems to be flv
3797 'description': video_description,
3802 self._downloader.process_info(info)
3803 except UnavailableVideoError, err:
3804 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3806 class MixcloudIE(InfoExtractor):
# Extracts audio from mixcloud.com via its JSON cloudcast API; supports
# multiple audio formats with optional per-format bitrate lists.
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# `return`s, loop bodies). Comments only; code bytes untouched.
3807 """Information extractor for www.mixcloud.com"""
3808 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3809 IE_NAME = u'mixcloud'
3811 def __init__(self, downloader=None):
3812 InfoExtractor.__init__(self, downloader)
3814 def report_download_json(self, file_id):
3815 """Report JSON download."""
3816 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3818 def report_extraction(self, file_id):
3819 """Report information extraction."""
3820 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3822 def get_urls(self, jsonData, fmt, bitrate='best'):
3823 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain [urls] list; the
# TypeError branch below handles the no-bitrate-info shape.
3826 bitrate_list = jsonData[fmt]
3827 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3828 bitrate = max(bitrate_list) # select highest
3830 url_list = jsonData[fmt][bitrate]
3831 except TypeError: # we have no bitrate info.
3832 url_list = jsonData[fmt]
3836 def check_urls(self, url_list):
3837 """Returns 1st active url from list"""
# Probe each candidate; network failures fall through to the next URL.
3838 for url in url_list:
3840 urllib2.urlopen(url)
3842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3847 def _print_formats(self, formats):
3848 print 'Available formats:'
3849 for fmt in formats.keys():
3850 for b in formats[fmt]:
3852 ext = formats[fmt][b][0]
3853 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3854 except TypeError: # we have no bitrate info
3855 ext = formats[fmt][0]
3856 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3859 def _real_extract(self, url):
3860 mobj = re.match(self._VALID_URL, url)
3862 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3864 # extract uploader & filename from url
3865 uploader = mobj.group(1).decode('utf-8')
3866 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3868 # construct API request
# The API mirrors the site path: /api/1/cloudcast/<uploader>/<slug>.json
3869 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3870 # retrieve .json file with links to files
3871 request = urllib2.Request(file_url)
3873 self.report_download_json(file_url)
3874 jsonData = urllib2.urlopen(request).read()
3875 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3876 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3880 json_data = json.loads(jsonData)
3881 player_url = json_data['player_swf_url']
3882 formats = dict(json_data['audio_formats'])
3884 req_format = self._downloader.params.get('format', None)
# --list-formats short-circuits extraction.
3887 if self._downloader.params.get('listformats', None):
3888 self._print_formats(formats)
# Default/'best': take the first format that yields a live URL.
3891 if req_format is None or req_format == 'best':
3892 for format_param in formats.keys():
3893 url_list = self.get_urls(formats, format_param)
3895 file_url = self.check_urls(url_list)
3896 if file_url is not None:
3899 if req_format not in formats.keys():
3900 self._downloader.trouble(u'ERROR: format is not available')
3903 url_list = self.get_urls(formats, req_format)
3904 file_url = self.check_urls(url_list)
3905 format_param = req_format
3908 self._downloader.increment_downloads()
3910 # Process file information
3911 self._downloader.process_info({
3912 'id': file_id.decode('utf-8'),
3913 'url': file_url.decode('utf-8'),
3914 'uploader': uploader.decode('utf-8'),
3915 'upload_date': u'NA',
3916 'title': json_data['name'],
3917 'stitle': _simplify_title(json_data['name']),
3918 'ext': file_url.split('.')[-1].decode('utf-8'),
3919 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3920 'thumbnail': json_data['thumbnail_url'],
3921 'description': json_data['description'],
3922 'player_url': player_url.decode('utf-8'),
3924 except UnavailableVideoError, err:
3925 self._downloader.trouble(u'ERROR: unable to download file')
3927 class StanfordOpenClassroomIE(InfoExtractor):
# Three-level extractor: a specific video (course+video), a course page
# (list of VideoPage references), or the root HomePage (list of CoursePage
# references). List entries are re-dispatched through self.extract().
# NOTE(review): numbered listing with elided lines (guards, `try:` openers,
# dict openers, loop bodies building info['list']). Comments only.
3928 """Information extractor for Stanford's Open ClassRoom"""
3930 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3931 IE_NAME = u'stanfordoc'
3933 def report_download_webpage(self, objid):
3934 """Report information extraction."""
3935 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3937 def report_extraction(self, video_id):
3938 """Report information extraction."""
3939 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3941 def _real_extract(self, url):
3942 mobj = re.match(self._VALID_URL, url)
3944 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a concrete video -- fetch its per-video metadata XML.
3947 if mobj.group('course') and mobj.group('video'): # A specific video
3948 course = mobj.group('course')
3949 video = mobj.group('video')
3951 'id': _simplify_title(course + '_' + video),
3954 self.report_extraction(info['id'])
3955 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3956 xmlUrl = baseUrl + video + '.xml'
3958 metaXml = urllib2.urlopen(xmlUrl).read()
3959 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3960 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3962 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Missing <title>/<videoFile> makes findall(...)[0] raise -> invalid XML.
3964 info['title'] = mdoc.findall('./title')[0].text
3965 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3967 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3969 info['stitle'] = _simplify_title(info['title'])
3970 info['ext'] = info['url'].rpartition('.')[2]
3971 info['format'] = info['ext']
3972 self._downloader.increment_downloads()
3974 self._downloader.process_info(info)
3975 except UnavailableVideoError, err:
3976 self._downloader.trouble(u'\nERROR: unable to download video')
# Case 2: a course page -- scrape VideoPage links into a reference list.
3977 elif mobj.group('course'): # A course page
3978 unescapeHTML = HTMLParser.HTMLParser().unescape
3980 course = mobj.group('course')
3982 'id': _simplify_title(course),
3986 self.report_download_webpage(info['id'])
3988 coursepage = urllib2.urlopen(url).read()
3989 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3990 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Title falls back to the simplified id when <h1> is absent.
3993 m = re.search('<h1>([^<]+)</h1>', coursepage)
3995 info['title'] = unescapeHTML(m.group(1))
3997 info['title'] = info['id']
3998 info['stitle'] = _simplify_title(info['title'])
4000 m = re.search('<description>([^<]+)</description>', coursepage)
4002 info['description'] = unescapeHTML(m.group(1))
# _orderedSet dedupes while keeping first-seen order of the links.
4004 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
4007 'type': 'reference',
4008 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each reference entry is fed back through the extractor dispatch.
4012 for entry in info['list']:
4013 assert entry['type'] == 'reference'
4014 self.extract(entry['url'])
# Case 3: the root page -- scrape CoursePage links the same way.
4016 unescapeHTML = HTMLParser.HTMLParser().unescape
4019 'id': 'Stanford OpenClassroom',
4023 self.report_download_webpage(info['id'])
4024 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
4026 rootpage = urllib2.urlopen(rootURL).read()
4027 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4028 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
4031 info['title'] = info['id']
4032 info['stitle'] = _simplify_title(info['title'])
4034 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
4037 'type': 'reference',
4038 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
4042 for entry in info['list']:
4043 assert entry['type'] == 'reference'
4044 self.extract(entry['url'])
4046 class MTVIE(InfoExtractor):
4047 """Information extractor for MTV.com"""
# Matches e.g. http://www.mtv.com/videos/<slug>/<digits>/<slug>; the scheme is
# optional and captured as 'proto' so it can be re-added below when absent.
4049 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4052 def report_webpage(self, video_id):
4053 """Report information extraction."""
4054 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4056 def report_extraction(self, video_id):
4057 """Report information extraction."""
4058 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# NOTE(review): this paste is gap-sampled; the `return` statements that
# normally follow each trouble() call (and several dict keys) are elided.
4060 def _real_extract(self, url):
4061 mobj = re.match(self._VALID_URL, url)
4063 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# URL may have been given without a scheme; normalize to http.
4065 if not mobj.group('proto'):
4066 url = 'http://' + url
4067 video_id = mobj.group('videoid')
4068 self.report_webpage(video_id)
4070 request = urllib2.Request(url)
4072 webpage = urllib2.urlopen(request).read()
4073 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4074 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Scrape song name and performer from MTV's <meta> tags; page bytes are
# decoded as ISO-8859-1 before HTML-unescaping.
4077 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4079 self._downloader.trouble(u'ERROR: unable to extract song name')
4081 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4082 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4084 self._downloader.trouble(u'ERROR: unable to extract performer')
4086 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4087 video_title = performer + ' - ' + song_name
# The mtvn_uri and playlist/content id are needed to build the mediaGen
# metadata request below.
4089 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4091 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4093 mtvn_uri = mobj.group(1)
4095 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4097 self._downloader.trouble(u'ERROR: unable to extract content id')
4099 content_id = mobj.group(1)
4101 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4102 self.report_extraction(video_id)
4103 request = urllib2.Request(videogen_url)
4105 metadataXml = urllib2.urlopen(request).read()
4106 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4107 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
# The mediaGen response is XML listing available <rendition> elements.
4110 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4111 renditions = mdoc.findall('.//rendition')
4113 # For now, always pick the highest quality.
4114 rendition = renditions[-1]
# 'type' is a MIME type like video/mp4; partition('/') keeps the subtype as
# the file extension.
4117 _,_,ext = rendition.attrib['type'].partition('/')
4118 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4119 video_url = rendition.find('./src').text
4121 self._downloader.trouble('Invalid rendition field.')
4124 self._downloader.increment_downloads()
# Elided here: the rest of the info dict ('id', 'url', 'ext', 'format', ...)
# — TODO confirm against the full source.
4128 'uploader': performer,
4129 'title': video_title,
4130 'stitle': _simplify_title(video_title),
4136 self._downloader.process_info(info)
4137 except UnavailableVideoError, err:
4138 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for all post processors.

    A PostProcessor is attached to a downloader through the downloader's
    add_post_processor() method, following the same "mutual registration"
    scheme as InfoExtractor objects. After each successful download the
    downloader walks its chain of processors, calling run() on every one
    in turn: the first call receives the initial info dictionary and each
    subsequent call receives whatever the previous processor returned.
    The chain stops when a processor returns None or when the end of the
    chain is reached.
    """

    def __init__(self, downloader=None):
        # Delegate to the setter so construction and late registration
        # share one code path.
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor on a finished download.

        `information` is a dictionary like the ones composed by
        InfoExtractors, with an extra 'filepath' key pointing at the
        downloaded file. Returning None stops the postprocessing chain;
        returning a (possibly modified) dictionary passes it on to the
        next processor. May raise a PostProcessingError, which the
        downloader takes into account.
        """
        # Default behaviour is the identity: hand the dict straight on.
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    Fix: this previously derived from BaseException, which meant ordinary
    ``except Exception`` handlers silently let it escape (BaseException is
    reserved for exit-style signals such as KeyboardInterrupt; see PEP 352).
    Callers that raise it or test ``isinstance(e, AudioConversionError)``
    are unaffected by the narrower base class.
    """

    def __init__(self, message):
        # Call the base constructor so str(err) and err.args carry the
        # message; keep the .message attribute for existing callers.
        super(AudioConversionError, self).__init__(message)
        self.message = message
# Post processor that converts a downloaded video into an audio-only file by
# shelling out to ffprobe (codec detection) and ffmpeg (conversion).
# NOTE(review): gap-sampled paste — several statements (ffprobe failure
# returns, run()'s final return, some branches) are elided from this view.
4191 class FFmpegExtractAudioPP(PostProcessor):
4193 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4194 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source audio stream losslessly when possible.
4195 if preferredcodec is None:
4196 preferredcodec = 'best'
4197 self._preferredcodec = preferredcodec
4198 self._preferredquality = preferredquality
4199 self._keepvideo = keepvideo
# Runs `ffprobe -show_streams` on `path` and returns the codec_name of the
# audio stream (the surrounding @staticmethod decorator line and the
# None-returning failure paths are elided here — TODO confirm).
4202 def get_audio_codec(path):
4204 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4205 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4206 output = handle.communicate()[0]
4207 if handle.wait() != 0:
4209 except (IOError, OSError):
# Parse ffprobe's key=value stream dump: remember the last codec_name seen,
# and accept it once a codec_type=audio line confirms it belongs to audio.
4212 for line in output.split('\n'):
4213 if line.startswith('codec_name='):
4214 audio_codec = line.split('=')[1].strip()
4215 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Runs ffmpeg to transcode `path` into `out_path` with the given audio codec
# and extra options; raises AudioConversionError on failure.
4220 def run_ffmpeg(path, out_path, codec, more_opts):
4224 acodec_opts = ['-acodec', codec]
# '-vn' drops the video stream; '--' guards against filenames starting with '-'.
4225 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4227 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4228 stdout,stderr = p.communicate()
4229 except (IOError, OSError):
4230 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4231 if isinstance(e, OSError) and e.errno == 2:
4232 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
# Surface ffmpeg's last stderr line as the error message.
4235 if p.returncode != 0:
4236 msg = stderr.strip().split('\n')[-1]
4237 raise AudioConversionError(msg)
4239 def run(self, information):
4240 path = information['filepath']
4242 filecodec = self.get_audio_codec(path)
4243 if filecodec is None:
4244 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Codec selection: prefer a lossless stream copy when the source codec
# already matches the request (or 'best' was asked for).
4248 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4249 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4250 # Lossless, but in another container
4252 extension = self._preferredcodec
# aac_adtstoasc rewraps raw ADTS AAC into the MP4/M4A container format.
4253 more_opts = ['-absf', 'aac_adtstoasc']
4254 elif filecodec in ['aac', 'mp3', 'vorbis']:
4255 # Lossless if possible
4257 extension = filecodec
4258 if filecodec == 'aac':
4259 more_opts = ['-f', 'adts']
4260 if filecodec == 'vorbis':
# Fallback for exotic source codecs: re-encode to MP3 (lossy).
4264 acodec = 'libmp3lame'
4267 if self._preferredquality is not None:
4268 more_opts += ['-ab', self._preferredquality]
4270 # We convert the audio (lossy)
4271 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4272 extension = self._preferredcodec
4274 if self._preferredquality is not None:
4275 more_opts += ['-ab', self._preferredquality]
4276 if self._preferredcodec == 'aac':
4277 more_opts += ['-f', 'adts']
4278 if self._preferredcodec == 'm4a':
4279 more_opts += ['-absf', 'aac_adtstoasc']
4280 if self._preferredcodec == 'vorbis':
4282 if self._preferredcodec == 'wav':
4284 more_opts += ['-f', 'wav']
4286 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4287 new_path = prefix + sep + extension
4288 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4290 self.run_ffmpeg(path, new_path, acodec, more_opts)
# Distinguish conversion failures (reported with ffmpeg's message) from
# unexpected errors while launching ffmpeg.
4292 etype,e,tb = sys.exc_info()
4293 if isinstance(e, AudioConversionError):
4294 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4296 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4299 # Try to update the date time for extracted audio file.
4300 if information.get('filetime') is not None:
4302 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4304 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Remove the source video unless --keep-video was requested.
4306 if not self._keepvideo:
4308 os.remove(_encodeFilename(path))
4309 except (IOError, OSError):
4310 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the chain at the converted file (the final `return information` is
# elided in this sampled view — TODO confirm).
4313 information['filepath'] = new_path
4317 def updateSelf(downloader, filename):
4318 ''' Update the program file with the latest version from the repository '''
4319 # Note: downloader only used for options
4320 if not os.access(filename, os.W_OK):
4321 sys.exit('ERROR: no write permissions on %s' % filename)
4323 downloader.to_screen(u'Updating to latest version...')
# Fetch the latest script from UPDATE_URL and compare its embedded
# __version__ string against ours (try: header elided in this view).
4327 urlh = urllib.urlopen(UPDATE_URL)
4328 newcontent = urlh.read()
4330 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4331 if vmatch is not None and vmatch.group(1) == __version__:
4332 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
# An early return (and urlh cleanup) is presumably elided here — TODO confirm.
4336 except (IOError, OSError), err:
4337 sys.exit('ERROR: unable to download latest version')
# Overwrite this very script in place with the downloaded content.
4340 outf = open(filename, 'wb')
4342 outf.write(newcontent)
4345 except (IOError, OSError), err:
4346 sys.exit('ERROR: unable to overwrite current version')
4348 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Reads a config file and returns its contents as a list of argv-style
# tokens; a missing file yields an empty list.
4351 def _readOptions(filename_bytes):
4353 optionf = open(filename_bytes)
4355 return [] # silently skip if file is not present
# Accumulation loop (header elided in this sampled view): shlex.split honours
# shell-style quoting and strips '#' comments from each line l.
4359 res += shlex.split(l, comments=True)
4364 def _format_option_string(option):
4365 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Builds the left-hand option column for --help output from optparse's
# internal _short_opts/_long_opts lists (the `opts = []` initializer is
# elided in this sampled view).
4369 if option._short_opts: opts.append(option._short_opts[0])
4370 if option._long_opts: opts.append(option._long_opts[0])
# When both a short and a long form exist, separate them with ', '.
4371 if len(opts) > 1: opts.insert(1, ', ')
4373 if option.takes_value(): opts.append(' %s' % option.metavar)
4375 return "".join(opts)
# Best-effort detection of the terminal width, used to size --help output.
4377 def _find_term_columns():
# Prefer the COLUMNS environment variable when the shell exports it.
4378 columns = os.environ.get('COLUMNS', None)
# Otherwise ask the terminal via `stty size`, which prints "rows cols"
# (failure-handling paths are elided in this sampled view).
4383 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4384 out,err = sp.communicate()
4385 return int(out.split()[1])
# NOTE(review): interior of parseOpts() — the `def` line is elided in this
# gap-sampled view. Builds the optparse parser, all option groups, and merges
# config-file options with sys.argv.
# Help formatting: widen the help column, and size to the terminal if we can
# detect it.
4391 max_help_position = 80
4393 # No need to wrap help messages if we're on a wide console
4394 columns = _find_term_columns()
4395 if columns: max_width = columns
4397 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Swap in our compact "-o, --option METAVAR" renderer.
4398 fmt.format_option_strings = _format_option_string
4401 'version' : __version__,
4403 'usage' : '%prog [options] url [url...]',
# 'resolve' lets -v be redefined below (--version vs --verbose).
4404 'conflict_handler' : 'resolve',
4407 parser = optparse.OptionParser(**kw)
# One OptionGroup per functional area; populated below and attached at the end.
4410 general = optparse.OptionGroup(parser, 'General Options')
4411 selection = optparse.OptionGroup(parser, 'Video Selection')
4412 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4413 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4414 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4415 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4416 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4418 general.add_option('-h', '--help',
4419 action='help', help='print this help text and exit')
4420 general.add_option('-v', '--version',
4421 action='version', help='print program version and exit')
4422 general.add_option('-U', '--update',
4423 action='store_true', dest='update_self', help='update this program to latest version')
4424 general.add_option('-i', '--ignore-errors',
4425 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4426 general.add_option('-r', '--rate-limit',
4427 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4428 general.add_option('-R', '--retries',
4429 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4430 general.add_option('--dump-user-agent',
4431 action='store_true', dest='dump_user_agent',
4432 help='display the current browser identification', default=False)
4433 general.add_option('--list-extractors',
4434 action='store_true', dest='list_extractors',
4435 help='List all supported extractors and the URLs they would handle', default=False)
4437 selection.add_option('--playlist-start',
4438 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4439 selection.add_option('--playlist-end',
4440 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4441 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4442 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4443 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4445 authentication.add_option('-u', '--username',
4446 dest='username', metavar='USERNAME', help='account username')
4447 authentication.add_option('-p', '--password',
4448 dest='password', metavar='PASSWORD', help='account password')
4449 authentication.add_option('-n', '--netrc',
4450 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4453 video_format.add_option('-f', '--format',
4454 action='store', dest='format', metavar='FORMAT', help='video format code')
4455 video_format.add_option('--all-formats',
4456 action='store_const', dest='format', help='download all available video formats', const='all')
4457 video_format.add_option('--prefer-free-formats',
4458 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4459 video_format.add_option('--max-quality',
4460 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4461 video_format.add_option('-F', '--list-formats',
4462 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4463 video_format.add_option('--write-srt',
4464 action='store_true', dest='writesubtitles',
4465 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4466 video_format.add_option('--srt-lang',
4467 action='store', dest='subtitleslang', metavar='LANG',
4468 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4471 verbosity.add_option('-q', '--quiet',
4472 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4473 verbosity.add_option('-s', '--simulate',
4474 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4475 verbosity.add_option('--skip-download',
4476 action='store_true', dest='skip_download', help='do not download the video', default=False)
4477 verbosity.add_option('-g', '--get-url',
4478 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4479 verbosity.add_option('-e', '--get-title',
4480 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4481 verbosity.add_option('--get-thumbnail',
4482 action='store_true', dest='getthumbnail',
4483 help='simulate, quiet but print thumbnail URL', default=False)
4484 verbosity.add_option('--get-description',
4485 action='store_true', dest='getdescription',
4486 help='simulate, quiet but print video description', default=False)
4487 verbosity.add_option('--get-filename',
4488 action='store_true', dest='getfilename',
4489 help='simulate, quiet but print output filename', default=False)
4490 verbosity.add_option('--get-format',
4491 action='store_true', dest='getformat',
4492 help='simulate, quiet but print output format', default=False)
4493 verbosity.add_option('--no-progress',
4494 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4495 verbosity.add_option('--console-title',
4496 action='store_true', dest='consoletitle',
4497 help='display progress in console titlebar', default=False)
# -v is redefined here (conflict_handler='resolve' above makes this legal).
4498 verbosity.add_option('-v', '--verbose',
4499 action='store_true', dest='verbose', help='print various debugging information', default=False)
4502 filesystem.add_option('-t', '--title',
4503 action='store_true', dest='usetitle', help='use title in file name', default=False)
4504 filesystem.add_option('-l', '--literal',
4505 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4506 filesystem.add_option('-A', '--auto-number',
4507 action='store_true', dest='autonumber',
4508 help='number downloaded files starting from 00000', default=False)
4509 filesystem.add_option('-o', '--output',
4510 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4511 filesystem.add_option('-a', '--batch-file',
4512 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4513 filesystem.add_option('-w', '--no-overwrites',
4514 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4515 filesystem.add_option('-c', '--continue',
4516 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4517 filesystem.add_option('--no-continue',
4518 action='store_false', dest='continue_dl',
4519 help='do not resume partially downloaded files (restart from beginning)')
4520 filesystem.add_option('--cookies',
4521 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4522 filesystem.add_option('--no-part',
4523 action='store_true', dest='nopart', help='do not use .part files', default=False)
4524 filesystem.add_option('--no-mtime',
4525 action='store_false', dest='updatetime',
4526 help='do not use the Last-modified header to set the file modification time', default=True)
4527 filesystem.add_option('--write-description',
4528 action='store_true', dest='writedescription',
4529 help='write video description to a .description file', default=False)
4530 filesystem.add_option('--write-info-json',
4531 action='store_true', dest='writeinfojson',
4532 help='write video metadata to a .info.json file', default=False)
4535 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4536 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4537 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4538 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4539 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4540 help='ffmpeg audio bitrate specification, 128k by default')
4541 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4542 help='keeps the video file on disk after the post-processing; the video is erased by default')
4545 parser.add_option_group(general)
4546 parser.add_option_group(selection)
4547 parser.add_option_group(filesystem)
4548 parser.add_option_group(verbosity)
4549 parser.add_option_group(video_format)
4550 parser.add_option_group(authentication)
4551 parser.add_option_group(postproc)
# Config-file options: system-wide /etc/youtube-dl.conf, then the per-user
# file (XDG config dir if set, else ~/.config), then the real command line —
# later sources override earlier ones.
4553 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4555 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4557 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4558 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4559 opts, args = parser.parse_args(argv)
4561 return parser, opts, args
4563 def gen_extractors():
4564 """ Return a list of an instance of every supported extractor.
4565 The order does matter; the first extractor matched is the one handling the URL.
# The playlist/user/search extractors delegate to a shared base IE instance,
# so one of each base is created up front and passed in.
4567 youtube_ie = YoutubeIE()
4568 google_ie = GoogleIE()
4569 yahoo_ie = YahooIE()
# Elided in this sampled view: the `return [` opener, many other IE entries,
# and the closing bracket — only a subset of the list is visible.
4571 YoutubePlaylistIE(youtube_ie),
4572 YoutubeUserIE(youtube_ie),
4573 YoutubeSearchIE(youtube_ie),
4575 MetacafeIE(youtube_ie),
4578 GoogleSearchIE(google_ie),
4581 YahooSearchIE(yahoo_ie),
4594 StanfordOpenClassroomIE(),
# NOTE(review): interior of the program's main routine — its `def` line is
# elided in this gap-sampled view, as are many returns/try headers below.
4601 parser, opts, args = parseOpts()
4603 # Open appropriate CookieJar
4604 if opts.cookiefile is None:
4605 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and preload it if readable.
4608 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4609 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4611 except (IOError, OSError), err:
4612 sys.exit(u'ERROR: unable to open cookie file')
4615 if opts.dump_user_agent:
4616 print std_headers['User-Agent']
4619 # Batch file verification
4621 if opts.batchfile is not None:
4623 if opts.batchfile == '-':
4626 batchfd = open(opts.batchfile, 'r')
4627 batchurls = batchfd.readlines()
4628 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and comment lines starting with '#', '/' or ';'.
4629 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4631 sys.exit(u'ERROR: batch file could not be read')
4632 all_urls = batchurls + args
4633 all_urls = map(lambda url: url.strip(), all_urls)
4635 # General configuration
# Install a global urllib2 opener carrying the cookie jar, the environment's
# proxy settings, and the custom YoutubeDLHandler.
4636 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4637 proxy_handler = urllib2.ProxyHandler()
4638 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4639 urllib2.install_opener(opener)
4640 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4643 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4645 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it would claim.
4647 if opts.list_extractors:
4648 for ie in extractors:
4650 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4651 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4652 for mu in matchedUrls:
4656 # Conflicting, missing and erroneous options
4657 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4658 parser.error(u'using .netrc conflicts with giving username/password')
4659 if opts.password is not None and opts.username is None:
4660 parser.error(u'account username missing')
4661 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4662 parser.error(u'using output template conflicts with using title, literal title or auto number')
4663 if opts.usetitle and opts.useliteral:
4664 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively rather than erroring.
4665 if opts.username is not None and opts.password is None:
4666 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string-valued options to their numeric forms.
4667 if opts.ratelimit is not None:
4668 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4669 if numeric_limit is None:
4670 parser.error(u'invalid rate limit specified')
4671 opts.ratelimit = numeric_limit
4672 if opts.retries is not None:
4674 opts.retries = long(opts.retries)
4675 except (TypeError, ValueError), err:
4676 parser.error(u'invalid retry count specified')
4678 opts.playliststart = int(opts.playliststart)
4679 if opts.playliststart <= 0:
4680 raise ValueError(u'Playlist start must be positive')
4681 except (TypeError, ValueError), err:
4682 parser.error(u'invalid playlist start number specified')
4684 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "to the end of the playlist".
4685 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4686 raise ValueError(u'Playlist end must be greater than playlist start')
4687 except (TypeError, ValueError), err:
4688 parser.error(u'invalid playlist end number specified')
4689 if opts.extractaudio:
4690 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4691 parser.error(u'invalid audio format specified')
# Build the FileDownloader from the validated options.
4694 fd = FileDownloader({
4695 'usenetrc': opts.usenetrc,
4696 'username': opts.username,
4697 'password': opts.password,
# Any of the "get-*" print modes implies quiet and skip-download.
4698 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4699 'forceurl': opts.geturl,
4700 'forcetitle': opts.gettitle,
4701 'forcethumbnail': opts.getthumbnail,
4702 'forcedescription': opts.getdescription,
4703 'forcefilename': opts.getfilename,
4704 'forceformat': opts.getformat,
4705 'simulate': opts.simulate,
4706 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4707 'format': opts.format,
4708 'format_limit': opts.format_limit,
4709 'listformats': opts.listformats,
# Output template: first truthy alternative wins — explicit -o, then the
# format/title/autonumber combinations, then the bare '%(id)s.%(ext)s'.
4710 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4711 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4712 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4713 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4714 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4715 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4716 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4717 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4718 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4719 or u'%(id)s.%(ext)s'),
4720 'ignoreerrors': opts.ignoreerrors,
4721 'ratelimit': opts.ratelimit,
4722 'nooverwrites': opts.nooverwrites,
4723 'retries': opts.retries,
4724 'continuedl': opts.continue_dl,
4725 'noprogress': opts.noprogress,
4726 'playliststart': opts.playliststart,
4727 'playlistend': opts.playlistend,
# Writing the video to stdout means logs must go to stderr instead.
4728 'logtostderr': opts.outtmpl == '-',
4729 'consoletitle': opts.consoletitle,
4730 'nopart': opts.nopart,
4731 'updatetime': opts.updatetime,
4732 'writedescription': opts.writedescription,
4733 'writeinfojson': opts.writeinfojson,
4734 'writesubtitles': opts.writesubtitles,
4735 'subtitleslang': opts.subtitleslang,
4736 'matchtitle': opts.matchtitle,
4737 'rejecttitle': opts.rejecttitle,
4738 'max_downloads': opts.max_downloads,
4739 'prefer_free_formats': opts.prefer_free_formats,
4740 'verbose': opts.verbose,
4742 for extractor in extractors:
4743 fd.add_info_extractor(extractor)
4746 if opts.extractaudio:
4747 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update replaces the running script file itself.
4750 if opts.update_self:
4751 updateSelf(fd, sys.argv[0])
4754 if len(all_urls) < 1:
4755 if not opts.update_self:
4756 parser.error(u'you must provide at least one URL')
4761 retcode = fd.download(all_urls)
4762 except MaxDownloadsReached:
4763 fd.to_screen(u'--max-download limit reached, aborting.')
4766 # Dump cookie jar if requested
4767 if opts.cookiefile is not None:
4770 except (IOError, OSError), err:
4771 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level exception mapping (part of the enclosing main() wrapper, whose
# `def` and try: lines are elided in this sampled view).
4778 except DownloadError:
4780 except SameFileError:
4781 sys.exit(u'ERROR: fixed output name but more than one file to download')
4782 except KeyboardInterrupt:
4783 sys.exit(u'\nERROR: Interrupted by user')
4785 if __name__ == '__main__':
4788 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: