2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
207 def yield_preferredencoding():
209 pref = locale.getpreferredencoding()
215 return yield_preferredencoding().next()
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
224 entity = matchobj.group(1)
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
236 numstr = u'0%s' % numstr
239 return unichr(long(numstr, base))
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
251 def sanitize_open(filename, open_mode):
252 """Try to open the given filename, and slightly tweak it if this fails.
254 Attempts to open the given filename. If this fails, it tries to change
255 the filename slightly, step by step, until it's either able to open it
256 or it fails and raises a final exception, like the standard open()
259 It returns the tuple (stream, definitive_file_name).
263 if sys.platform == 'win32':
265 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
266 return (sys.stdout, filename)
267 stream = open(_encodeFilename(filename), open_mode)
268 return (stream, filename)
269 except (IOError, OSError), err:
270 # In case of error, try to remove win32 forbidden chars
271 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
273 # An exception here should be caught in the caller
274 stream = open(_encodeFilename(filename), open_mode)
275 return (stream, filename)
278 def timeconvert(timestr):
279 """Convert RFC 2822 defined time string into system timestamp"""
281 timetuple = email.utils.parsedate_tz(timestr)
282 if timetuple is not None:
283 timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
def _unescapeHTML(s):
	"""
	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')

	# Delegate entity decoding to the stdlib HTML parser.
	parser = HTMLParser.HTMLParser()
	return parser.unescape(s)
307 def _encodeFilename(s):
309 @param s The name of the file (of type unicode)
312 assert type(s) == type(u'')
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
322 class DownloadError(Exception):
323 """Download Error exception.
325 This exception may be thrown by FileDownloader objects if they are not
326 configured to continue on errors. They will contain the appropriate
332 class SameFileError(Exception):
333 """Same File exception.
335 This exception will be thrown by FileDownloader objects if they detect
336 multiple files would have to be downloaded to the same file on disk.
341 class PostProcessingError(Exception):
342 """Post Processing exception.
344 This exception may be raised by PostProcessor's .run() method to
345 indicate an error in the postprocessing task.
349 class MaxDownloadsReached(Exception):
350 """ --max-downloads limit has been reached. """
354 class UnavailableVideoError(Exception):
355 """Unavailable Format exception.
357 This exception will be thrown when a video is requested
358 in a format that is not available for that video.
363 class ContentTooShortError(Exception):
364 """Content Too Short exception.
366 This exception may be raised by FileDownloader objects when a file they
367 download is too small for what the server announced first, indicating
368 the connection was probably interrupted.
374 def __init__(self, downloaded, expected):
375 self.downloaded = downloaded
376 self.expected = expected
379 class YoutubeDLHandler(urllib2.HTTPHandler):
380 """Handler for HTTP requests and responses.
382 This class, when installed with an OpenerDirector, automatically adds
383 the standard headers to every HTTP request and handles gzipped and
384 deflated responses from web servers. If compression is to be avoided in
385 a particular request, the original request in the program code only has
386 to include the HTTP header "Youtubedl-No-Compression", which will be
387 removed before making the real request.
389 Part of this code was copied from:
391 http://techknack.net/python-urllib2-handlers/
393 Andrew Rowls, the author of that code, agreed to release it to the
400 return zlib.decompress(data, -zlib.MAX_WBITS)
402 return zlib.decompress(data)
405 def addinfourl_wrapper(stream, headers, url, code):
406 if hasattr(urllib2.addinfourl, 'getcode'):
407 return urllib2.addinfourl(stream, headers, url, code)
408 ret = urllib2.addinfourl(stream, headers, url)
412 def http_request(self, req):
413 for h in std_headers:
416 req.add_header(h, std_headers[h])
417 if 'Youtubedl-no-compression' in req.headers:
418 if 'Accept-encoding' in req.headers:
419 del req.headers['Accept-encoding']
420 del req.headers['Youtubedl-no-compression']
423 def http_response(self, req, resp):
426 if resp.headers.get('Content-encoding', '') == 'gzip':
427 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
431 if resp.headers.get('Content-encoding', '') == 'deflate':
432 gz = StringIO.StringIO(self.deflate(resp.read()))
433 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
434 resp.msg = old_resp.msg
438 class FileDownloader(object):
439 """File Downloader class.
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
501 _download_retcode = None
502 _num_downloads = None
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
515 def format_bytes(bytes):
518 if type(bytes) is str:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
529 def calc_percent(byte_counter, data_len):
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
535 def calc_eta(start, now, total, current):
539 if current == 0 or dif < 0.001: # One millisecond
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
546 return '%02d:%02d' % (eta_mins, eta_secs)
549 def calc_speed(start, now, bytes):
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
561 rate = bytes / elapsed_time
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
578 def add_info_extractor(self, ie):
579 """Add an InfoExtractor object to the end of the list."""
581 ie.set_downloader(self)
583 def add_post_processor(self, pp):
584 """Add a PostProcessor object to the end of the chain."""
586 pp.set_downloader(self)
588 def to_screen(self, message, skip_eol=False):
589 """Print message to stdout if not in quiet mode."""
590 assert type(message) == type(u'')
591 if not self.params.get('quiet', False):
592 terminator = [u'\n', u''][skip_eol]
593 output = message + terminator
595 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596 output = output.encode(preferredencoding(), 'ignore')
597 self._screen_file.write(output)
598 self._screen_file.flush()
600 def to_stderr(self, message):
601 """Print message to stderr."""
602 print >>sys.stderr, message.encode(preferredencoding())
604 def to_cons_title(self, message):
605 """Set console/terminal window title to message."""
606 if not self.params.get('consoletitle', False):
608 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
609 # c_wchar_p() might not be necessary if `message` is
610 # already of type unicode()
611 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
612 elif 'TERM' in os.environ:
613 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
619 def trouble(self, message=None):
620 """Determine action to take when a download problem appears.
622 Depending on if the downloader has been configured to ignore
623 download errors or not, this method may throw an exception or
624 not when errors are found, after printing the message.
626 if message is not None:
627 self.to_stderr(message)
628 if not self.params.get('ignoreerrors', False):
629 raise DownloadError(message)
630 self._download_retcode = 1
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
638 elapsed = now - start_time
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
650 return filename + u'.part'
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
657 def try_rename(self, old_filename, new_filename):
659 if old_filename == new_filename:
661 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
662 except (IOError, OSError), err:
663 self.trouble(u'ERROR: unable to rename file')
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
669 if not os.path.isfile(_encodeFilename(filename)):
671 timestr = last_modified_hdr
674 filetime = timeconvert(timestr)
678 os.utime(filename, (time.time(), filetime))
683 def report_writedescription(self, descfn):
684 """ Report that the description file is being written """
685 self.to_screen(u'[info] Writing video description to: ' + descfn)
687 def report_writesubtitles(self, srtfn):
688 """ Report that the subtitles file is being written """
689 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
695 def report_destination(self, filename):
696 """Report destination filename."""
697 self.to_screen(u'[download] Destination: ' + filename)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
708 def report_resuming_byte(self, resume_len):
709 """Report attempt to resume at given byte."""
710 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
712 def report_retry(self, count, retries):
713 """Report retry in case of HTTP error 5xx"""
714 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
716 def report_file_already_downloaded(self, file_name):
717 """Report file has already been fully downloaded."""
719 self.to_screen(u'[download] %s has already been downloaded' % file_name)
720 except (UnicodeEncodeError), err:
721 self.to_screen(u'[download] The file has already been downloaded')
723 def report_unable_to_resume(self):
724 """Report it was impossible to resume download."""
725 self.to_screen(u'[download] Unable to resume')
727 def report_finish(self):
728 """Report download finished."""
729 if self.params.get('noprogress', False):
730 self.to_screen(u'[download] Download completed')
734 def increment_downloads(self):
735 """Increment the ordinal that assigns a number to each file."""
736 self._num_downloads += 1
738 def prepare_filename(self, info_dict):
739 """Generate the output filename."""
741 template_dict = dict(info_dict)
742 template_dict['epoch'] = unicode(long(time.time()))
743 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744 filename = self.params['outtmpl'] % template_dict
746 except (ValueError, KeyError), err:
747 self.trouble(u'ERROR: invalid system charset or erroneous output template')
750 def _match_entry(self, info_dict):
751 """ Returns None iff the file should be downloaded """
753 title = info_dict['title']
754 matchtitle = self.params.get('matchtitle', False)
755 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757 rejecttitle = self.params.get('rejecttitle', False)
758 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
775 filename = self.prepare_filename(info_dict)
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
806 if self.params.get('writedescription', False):
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
812 descfile.write(info_dict['description'].encode('utf-8'))
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
843 infof = open(_encodeFilename(infofn), 'wb')
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
881 suitable_found = False
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
887 # Suitable InfoExtractor found
888 suitable_found = True
890 # Extract information from URL and process it
893 # Suitable InfoExtractor had been found; go to next URL
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
899 return self._download_retcode
901 def post_process(self, filename, ie_info):
902 """Run the postprocessing chain on the given file."""
904 info['filepath'] = filename
910 def _download_with_rtmpdump(self, filename, url, player_url):
911 self.report_destination(filename)
912 tmpfilename = self.temp_name(filename)
914 # Check for rtmpdump first
916 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917 except (OSError, IOError):
918 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
921 # Download using rtmpdump. rtmpdump returns exit code 2 when
922 # the connection was interrumpted and resuming appears to be
923 # possible. This is part of rtmpdump's normal usage, AFAIK.
924 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926 if self.params.get('verbose', False):
929 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
932 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933 retval = subprocess.call(args)
934 while retval == 2 or retval == 1:
935 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937 time.sleep(5.0) # This seems to be needed
938 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939 cursize = os.path.getsize(_encodeFilename(tmpfilename))
940 if prevsize == cursize and retval == 1:
942 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943 if prevsize == cursize and retval == 2 and cursize > 1024:
944 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
948 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949 self.try_rename(tmpfilename, filename)
952 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
968 tmpfilename = self.temp_name(filename)
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1034 if count <= retries:
1035 self.report_retry(count, retries)
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1053 if len(data_block) == 0:
1055 byte_counter += len(data_block)
1057 # Open file just in time
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1072 block_size = self.best_block_size(after - before, len(data_block))
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1084 self.slow_down(start, byte_counter - resume_len)
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Anchored match against the subclass-defined _VALID_URL pattern.
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		# May be None; subclasses guard against that before using it.
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1180 _NETRC_MACHINE = 'youtube'
1181 # Listed in order of quality
1182 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1183 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1184 _video_extensions = {
1190 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1195 _video_dimensions = {
1210 IE_NAME = u'youtube'
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1248 def _closed_captions_xml_to_srt(self, xml_string):
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
	def _print_formats(self, formats):
		"""Print each available format code with its extension and dimensions."""
		print 'Available formats:'
		# NOTE(review): the loop header iterating `formats` (binding x to each
		# itag code) is missing from this listing -- confirm against upstream.
		print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
	def _real_initialize(self):
		"""Set YouTube language to English, log in, and confirm age.

		Credentials come from downloader params ('username'/'password') or,
		with 'usenetrc', from the user's .netrc. Network and auth failures
		are reported as warnings via the downloader (the age check reports
		an error). NOTE(review): this listing is elided -- several try:/
		return/else: scaffolding lines are missing, so indentation below is
		a best-effort reconstruction.
		"""
		if self._downloader is None:
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
				raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Set language (elided try: around the request)
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed
		if username is None:

		# Log in (beginning of the login_form dict is elided)
			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# A loginForm in the response means the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

		# Confirm age (beginning of the age_form dict is elided)
			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		"""Extract video information from a YouTube URL and pass each chosen
		format to the downloader via process_info().

		NOTE(review): this listing is elided -- many scaffolding lines
		(try:/return/else:/`if mobj is None:` guards) are missing, so the
		indentation below is a best-effort reconstruction.
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		# Get video webpage; has_verified=1 skips the age interstitial.
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

		# Get video info: try several el= values until one returns a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Start extracting information
		self.report_information_extraction(video_id)

		# Uploader nickname
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# Title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# Thumbnail: only a warning if missing
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# Upload date: scraped from the watch page, normalized to YYYYMMDD.
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		# Description: meta tag first, then the full text via lxml if available.
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# Closed captions, converted to SRT when requested.
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			self.report_video_subtitles_download(video_id)
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				srt_list = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
			if self._downloader.params.get('subtitleslang', False):
				srt_lang = self._downloader.params.get('subtitleslang')
			elif 'en' in srt_lang_list:
				srt_lang = srt_lang_list[0]
			if not srt_lang in srt_lang_list:
				self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
			request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
				srt_xml = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
			video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			self._downloader.trouble(u'WARNING: video has no closed captions')

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			# itag code -> direct URL for every stream YouTube offered.
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			video_extension = self._video_extensions.get(format_param, 'flv')

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_real_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description,
				'player_url': player_url,
				'subtitles': video_subtitles
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# group(1) is the video id; group(2) is the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter disclaimer page fetched during initialization.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint POSTed to in order to get past the family filter.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'
	def __init__(self, youtube_ie, downloader=None):
		"""Store the YoutubeIE used for 'yt-' prefixed ids; downloader optional."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
1563 def report_disclaimer(self):
1564 """Report disclaimer retrieval."""
1565 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1567 def report_age_confirmation(self):
1568 """Report attempt to confirm age."""
1569 self._downloader.to_screen(u'[metacafe] Confirming age')
1571 def report_download_webpage(self, video_id):
1572 """Report webpage download."""
1573 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1575 def report_extraction(self, video_id):
1576 """Report information extraction."""
1577 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
	def _real_initialize(self):
		"""Fetch the disclaimer page, then POST to disable the family filter.

		NOTE(review): this listing is elided -- the try:/return scaffolding
		and the start of the disclaimer_form dict are missing.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# Confirm age (beginning of the disclaimer_form dict is elided)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		"""Extract a Metacafe video (delegating 'yt-' ids to YoutubeIE) and
		pass its information to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Older pages expose &mediaURL= directly; newer ones embed it in flashvars.
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			video_url = mediaURL
			gdaKey = mobj.group(1)
			video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# group(1) is the video id; group(2) is the URL slug.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1701 def report_download_webpage(self, video_id):
1702 """Report webpage download."""
1703 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1705 def report_extraction(self, video_id):
1706 """Report information extraction."""
1707 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		"""Extract a Dailymotion video (SD URL from the 'sequence' flashvar)
		and pass its information to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Cookie disables the family filter so restricted videos resolve.
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# group(1) is the docid (video id), taken from the videoplay URL.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1789 def report_download_webpage(self, video_id):
1790 """Report webpage download."""
1791 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1793 def report_extraction(self, video_id):
1794 """Report information extraction."""
1795 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
	def _real_extract(self, url):
		"""Extract a Google Video entry (mp4 download URL when present,
		otherwise the escaped flv videoUrl) and pass it to the downloader.

		NOTE(review): this listing is elided -- try:/return/`if mobj is
		None:` guards are missing; indentation is a best-effort
		reconstruction.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			# No direct download URL: fall back to the escaped flv videoUrl.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the \xNN escaping used inside the page's JavaScript.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# group(1) is the .flv filename from the ?current= query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1893 def report_download_webpage(self, video_id):
1894 """Report webpage download."""
1895 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1897 def report_extraction(self, video_id):
1898 """Report information extraction."""
1899 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1901 def _real_extract(self, url):
1902 # Extract id from URL
1903 mobj = re.match(self._VALID_URL, url)
1905 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1908 # At this point we have a new video
1909 self._downloader.increment_downloads()
1910 video_id = mobj.group(1)
1912 video_extension = 'flv'
1914 # Retrieve video webpage to extract further information
1915 request = urllib2.Request(url)
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1923 # Extract URL, uploader, and title from webpage
1924 self.report_extraction(video_id)
1925 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1927 self._downloader.trouble(u'ERROR: unable to extract media URL')
1929 mediaURL = urllib.unquote(mobj.group(1))
1931 video_url = mediaURL
1933 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1935 self._downloader.trouble(u'ERROR: unable to extract title')
1937 video_title = mobj.group(1).decode('utf-8')
1938 video_title = sanitize_title(video_title)
1939 simple_title = _simplify_title(vide_title)
1941 video_uploader = mobj.group(2).decode('utf-8')
1944 # Process video information
1945 self._downloader.process_info({
1946 'id': video_id.decode('utf-8'),
1947 'url': video_url.decode('utf-8'),
1948 'uploader': video_uploader,
1949 'upload_date': u'NA',
1950 'title': video_title,
1951 'stitle': simple_title,
1952 'ext': video_extension.decode('utf-8'),
1956 except UnavailableVideoError:
1957 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)
1972 def report_download_webpage(self, video_id):
1973 """Report webpage download."""
1974 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1976 def report_extraction(self, video_id):
1977 """Report information extraction."""
1978 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1980 def _real_extract(self, url, new_video=True):
1981 # Extract ID from URL
1982 mobj = re.match(self._VALID_URL, url)
1984 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1987 # At this point we have a new video
1988 self._downloader.increment_downloads()
1989 video_id = mobj.group(2)
1990 video_extension = 'flv'
1992 # Rewrite valid but non-extractable URLs as
1993 # extractable English language /watch/ URLs
1994 if re.match(self._VPAGE_URL, url) is None:
1995 request = urllib2.Request(url)
1997 webpage = urllib2.urlopen(request).read()
1998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2004 self._downloader.trouble(u'ERROR: Unable to extract id field')
2006 yahoo_id = mobj.group(1)
2008 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2010 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2012 yahoo_vid = mobj.group(1)
2014 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2015 return self._real_extract(url, new_video=False)
2017 # Retrieve video webpage to extract further information
2018 request = urllib2.Request(url)
2020 self.report_download_webpage(video_id)
2021 webpage = urllib2.urlopen(request).read()
2022 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2023 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2026 # Extract uploader and title from webpage
2027 self.report_extraction(video_id)
2028 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2030 self._downloader.trouble(u'ERROR: unable to extract video title')
2032 video_title = mobj.group(1).decode('utf-8')
2033 simple_title = _simplify_title(video_title)
2035 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2037 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2039 video_uploader = mobj.group(1).decode('utf-8')
2041 # Extract video thumbnail
2042 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2044 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2046 video_thumbnail = mobj.group(1).decode('utf-8')
2048 # Extract video description
2049 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2051 self._downloader.trouble(u'ERROR: unable to extract video description')
2053 video_description = mobj.group(1).decode('utf-8')
2054 if not video_description:
2055 video_description = 'No description available.'
2057 # Extract video height and width
2058 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video height')
2062 yv_video_height = mobj.group(1)
2064 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2066 self._downloader.trouble(u'ERROR: unable to extract video width')
2068 yv_video_width = mobj.group(1)
2070 # Retrieve video playlist to extract media URL
2071 # I'm not completely sure what all these options are, but we
2072 # seem to need most of them, otherwise the server sends a 401.
2073 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2074 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2075 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2076 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2077 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2079 self.report_download_webpage(video_id)
2080 webpage = urllib2.urlopen(request).read()
2081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2082 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2085 # Extract media URL from playlist XML
2086 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2088 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2090 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2091 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2094 # Process video information
2095 self._downloader.process_info({
2096 'id': video_id.decode('utf-8'),
2098 'uploader': video_uploader,
2099 'upload_date': u'NA',
2100 'title': video_title,
2101 'stitle': simple_title,
2102 'ext': video_extension.decode('utf-8'),
2103 'thumbnail': video_thumbnail.decode('utf-8'),
2104 'description': video_description,
2105 'thumbnail': video_thumbnail,
2108 except UnavailableVideoError:
2109 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — the embedded leading numbers are the original
# file's own line numbers; gaps in them mark lines omitted from this view
# (the `try:`, `if mobj is None:` and `return` guard lines, among others).
# Comments below annotate only what is visible.
2112 class VimeoIE(InfoExtractor):
2113 """Information extractor for vimeo.com."""
2115 # _VALID_URL matches Vimeo URLs
2116 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2119 def __init__(self, downloader=None):
2120 InfoExtractor.__init__(self, downloader)
2122 def report_download_webpage(self, video_id):
2123 """Report webpage download."""
2124 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2126 def report_extraction(self, video_id):
2127 """Report information extraction."""
2128 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2130 def _real_extract(self, url, new_video=True):
2131 # Extract ID from URL
2132 mobj = re.match(self._VALID_URL, url)
2134 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2137 # At this point we have a new video
2138 self._downloader.increment_downloads()
2139 video_id = mobj.group(1)
2141 # Retrieve video webpage to extract further information
2142 request = urllib2.Request(url, None, std_headers)
2144 self.report_download_webpage(video_id)
2145 webpage = urllib2.urlopen(request).read()
2146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2150 # Now we begin extracting as much information as we can from what we
2151 # retrieved. First we extract the information common to all extractors,
2152 # and latter we extract those that are Vimeo specific.
2153 self.report_extraction(video_id)
2155 # Extract the config JSON
# NOTE(review): brittle scrape — relies on the literal markers ' = {config:'
# and ',assets:' appearing in the page source; breaks if Vimeo's markup changes.
2156 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2158 config = json.loads(config)
2160 self._downloader.trouble(u'ERROR: unable to extract info section')
2164 video_title = config["video"]["title"]
2165 simple_title = _simplify_title(video_title)
2168 video_uploader = config["video"]["owner"]["name"]
2170 # Extract video thumbnail
2171 video_thumbnail = config["video"]["thumbnail"]
2173 # Extract video description
2177 video_description = u'No description available.'
2178 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2179 if mobj is not None:
2180 video_description = mobj.group(1)
# NOTE(review): lxml is a third-party dependency; this alternative extraction
# path appears alongside the regex one — control flow between them is elided here.
2182 html_parser = lxml.etree.HTMLParser()
2183 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2184 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2185 # TODO use another parser
2187 # Extract upload date
2188 video_upload_date = u'NA'
2189 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2190 if mobj is not None:
2191 video_upload_date = mobj.group(1)
2193 # Vimeo specific: extract request signature and timestamp
2194 sig = config['request']['signature']
2195 timestamp = config['request']['timestamp']
2197 # Vimeo specific: extract video codec and quality information
2198 # TODO bind to format param
# Preference order: h264 first, then vp8, then vp6; presumably the loop stops
# at the first codec present in config["video"]["files"] (break line elided) — confirm.
2199 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2200 for codec in codecs:
2201 if codec[0] in config["video"]["files"]:
2202 video_codec = codec[0]
2203 video_extension = codec[1]
2204 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2205 else: quality = 'sd'
2208 self._downloader.trouble(u'ERROR: no known codec found')
2211 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2212 %(video_id, sig, timestamp, quality, video_codec.upper())
2215 # Process video information
# NOTE(review): the 'id' and 'url' entries of this dict fall in elided lines
# (original 2217-2218) — do not assume they are missing from the real file.
2216 self._downloader.process_info({
2219 'uploader': video_uploader,
2220 'upload_date': video_upload_date,
2221 'title': video_title,
2222 'stitle': simple_title,
2223 'ext': video_extension,
2224 'thumbnail': video_thumbnail,
2225 'description': video_description,
2228 except UnavailableVideoError:
2229 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted lines (`try:` / `if mobj is None:` / `return`).
2232 class GenericIE(InfoExtractor):
2233 """Generic last-resort information extractor."""
2236 IE_NAME = u'generic'
2238 def __init__(self, downloader=None):
2239 InfoExtractor.__init__(self, downloader)
2241 def report_download_webpage(self, video_id):
2242 """Report webpage download."""
# Deliberately loud: the generic extractor is best-effort, so warn the user first.
2243 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2244 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2246 def report_extraction(self, video_id):
2247 """Report information extraction."""
2248 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2250 def _real_extract(self, url):
2251 # At this point we have a new video
2252 self._downloader.increment_downloads()
2254 video_id = url.split('/')[-1]
2255 request = urllib2.Request(url)
2257 self.report_download_webpage(video_id)
2258 webpage = urllib2.urlopen(request).read()
2259 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2260 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2262 except ValueError, err:
2263 # since this is the last-resort InfoExtractor, if
2264 # this error is thrown, it'll be thrown here
2265 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2268 self.report_extraction(video_id)
2269 # Start with something easy: JW Player in SWFObject
2270 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2272 # Broaden the search a little bit
2273 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2278 # It's possible that one of the regexes
2279 # matched, but returned an empty group:
2280 if mobj.group(1) is None:
2281 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2284 video_url = urllib.unquote(mobj.group(1))
2285 video_id = os.path.basename(video_url)
2287 # here's a fun little line of code for you:
2288 video_extension = os.path.splitext(video_id)[1][1:]
2289 video_id = os.path.splitext(video_id)[0]
2291 # it's tempting to parse this further, but you would
2292 # have to take into account all the variations like
2293 # Video Title - Site Name
2294 # Site Name | Video Title
2295 # Video Title - Tagline | Site Name
2296 # and so on and so forth; it's just not practical
2297 mobj = re.search(r'<title>(.*)</title>', webpage)
2299 self._downloader.trouble(u'ERROR: unable to extract title')
2301 video_title = mobj.group(1).decode('utf-8')
2302 video_title = sanitize_title(video_title)
2303 simple_title = _simplify_title(video_title)
2305 # video uploader is domain name
2306 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# BUG(review): copy-pasted message — this guard is for the uploader (domain)
# match above, but the error text still says "title". Should read
# 'unable to extract uploader'. (Not changed here: doc-only edit.)
2308 self._downloader.trouble(u'ERROR: unable to extract title')
2310 video_uploader = mobj.group(1).decode('utf-8')
2313 # Process video information
2314 self._downloader.process_info({
2315 'id': video_id.decode('utf-8'),
2316 'url': video_url.decode('utf-8'),
2317 'uploader': video_uploader,
2318 'upload_date': u'NA',
2319 'title': video_title,
2320 'stitle': simple_title,
2321 'ext': video_extension.decode('utf-8'),
2325 except UnavailableVideoError, err:
2326 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch lines.
2329 class YoutubeSearchIE(InfoExtractor):
2330 """Information Extractor for YouTube search queries."""
2331 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 API, JSON-C output, 50 results per page (the API's page-size maximum).
2332 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2334 _max_youtube_results = 1000
2335 IE_NAME = u'youtube:search'
2337 def __init__(self, youtube_ie, downloader=None):
2338 InfoExtractor.__init__(self, downloader)
# Delegates actual per-video extraction to the wrapped YoutubeIE instance.
2339 self._youtube_ie = youtube_ie
2341 def report_download_page(self, query, pagenum):
2342 """Report attempt to download playlist page with given number."""
2343 query = query.decode(preferredencoding())
2344 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2346 def _real_initialize(self):
2347 self._youtube_ie.initialize()
2349 def _real_extract(self, query):
2350 mobj = re.match(self._VALID_URL, query)
2352 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2355 prefix, query = query.split(':')
2357 query = query.encode('utf-8')
# NOTE(review): the branch conditions around these calls (empty prefix ->
# single result, numeric prefix -> that many) are partly elided from view.
2359 self._download_n_results(query, 1)
2361 elif prefix == 'all':
2362 self._download_n_results(query, self._max_youtube_results)
2368 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2370 elif n > self._max_youtube_results:
2371 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2372 n = self._max_youtube_results
2373 self._download_n_results(query, n)
2375 except ValueError: # parsing prefix as integer fails
2376 self._download_n_results(query, 1)
2379 def _download_n_results(self, query, n):
2380 """Downloads a specified number of results for a query"""
# Page through the API 50 at a time until `limit` (min of n and totalItems) is met.
2386 while (50 * pagenum) < limit:
2387 self.report_download_page(query, pagenum+1)
2388 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2389 request = urllib2.Request(result_url)
2391 data = urllib2.urlopen(request).read()
2392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2393 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2395 api_response = json.loads(data)['data']
2397 new_ids = list(video['id'] for video in api_response['items'])
2398 video_ids += new_ids
# Re-clamp the limit once the API reports the true total number of items.
2400 limit = min(n, api_response['totalItems'])
2403 if len(video_ids) > n:
2404 video_ids = video_ids[:n]
2405 for id in video_ids:
2406 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch/return lines. Mirrors YoutubeSearchIE
# but scrapes HTML result pages instead of a JSON API.
2410 class GoogleSearchIE(InfoExtractor):
2411 """Information Extractor for Google Video search queries."""
2412 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2413 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2414 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link; absence means the last page was reached.
2415 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2417 _max_google_results = 1000
2418 IE_NAME = u'video.google:search'
2420 def __init__(self, google_ie, downloader=None):
2421 InfoExtractor.__init__(self, downloader)
2422 self._google_ie = google_ie
2424 def report_download_page(self, query, pagenum):
2425 """Report attempt to download playlist page with given number."""
2426 query = query.decode(preferredencoding())
2427 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2429 def _real_initialize(self):
2430 self._google_ie.initialize()
2432 def _real_extract(self, query):
2433 mobj = re.match(self._VALID_URL, query)
2435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2438 prefix, query = query.split(':')
2440 query = query.encode('utf-8')
2442 self._download_n_results(query, 1)
2444 elif prefix == 'all':
2445 self._download_n_results(query, self._max_google_results)
2451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2453 elif n > self._max_google_results:
2454 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2455 n = self._max_google_results
2456 self._download_n_results(query, n)
2458 except ValueError: # parsing prefix as integer fails
2459 self._download_n_results(query, 1)
2462 def _download_n_results(self, query, n):
2463 """Downloads a specified number of results for a query"""
2469 self.report_download_page(query, pagenum)
# 10 results per HTML page; 'start' is the zero-based result offset.
2470 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2471 request = urllib2.Request(result_url)
2473 page = urllib2.urlopen(request).read()
2474 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2478 # Extract video identifiers
2479 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2480 video_id = mobj.group(1)
2481 if video_id not in video_ids:
2482 video_ids.append(video_id)
# Two exit paths: got n ids, or no more pages — either way extract what we have.
# (The `return` lines after each extract loop are elided from this view.)
2483 if len(video_ids) == n:
2484 # Specified n videos reached
2485 for id in video_ids:
2486 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2489 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2490 for id in video_ids:
2491 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2494 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/branch/return lines. Same shape as
# GoogleSearchIE, for Yahoo! Video search.
2497 class YahooSearchIE(InfoExtractor):
2498 """Information Extractor for Yahoo! Video search queries."""
2499 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2500 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2501 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2502 _MORE_PAGES_INDICATOR = r'\s*Next'
2504 _max_yahoo_results = 1000
2505 IE_NAME = u'video.yahoo:search'
2507 def __init__(self, yahoo_ie, downloader=None):
2508 InfoExtractor.__init__(self, downloader)
2509 self._yahoo_ie = yahoo_ie
2511 def report_download_page(self, query, pagenum):
2512 """Report attempt to download playlist page with given number."""
2513 query = query.decode(preferredencoding())
2514 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2516 def _real_initialize(self):
2517 self._yahoo_ie.initialize()
2519 def _real_extract(self, query):
2520 mobj = re.match(self._VALID_URL, query)
2522 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2525 prefix, query = query.split(':')
2527 query = query.encode('utf-8')
2529 self._download_n_results(query, 1)
2531 elif prefix == 'all':
2532 self._download_n_results(query, self._max_yahoo_results)
2538 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2540 elif n > self._max_yahoo_results:
2541 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2542 n = self._max_yahoo_results
2543 self._download_n_results(query, n)
2545 except ValueError: # parsing prefix as integer fails
2546 self._download_n_results(query, 1)
2549 def _download_n_results(self, query, n):
2550 """Downloads a specified number of results for a query"""
# `already_seen` gives O(1) de-duplication across pages while `video_ids`
# (a list, initialized in an elided line) preserves discovery order.
2553 already_seen = set()
2557 self.report_download_page(query, pagenum)
2558 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2559 request = urllib2.Request(result_url)
2561 page = urllib2.urlopen(request).read()
2562 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2563 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2566 # Extract video identifiers
2567 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2568 video_id = mobj.group(1)
2569 if video_id not in already_seen:
2570 video_ids.append(video_id)
2571 already_seen.add(video_id)
2572 if len(video_ids) == n:
2573 # Specified n videos reached
2574 for id in video_ids:
2575 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2579 for id in video_ids:
2580 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2583 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted guard/else/return lines.
2586 class YoutubePlaylistIE(InfoExtractor):
2587 """Information Extractor for YouTube playlists."""
# Group 1: playlist type char (p/a/list); group 2: playlist id; group 3: an
# optional trailing video id (handled as a direct video below).
2589 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2590 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2591 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2592 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2594 IE_NAME = u'youtube:playlist'
2596 def __init__(self, youtube_ie, downloader=None):
2597 InfoExtractor.__init__(self, downloader)
2598 self._youtube_ie = youtube_ie
2600 def report_download_page(self, playlist_id, pagenum):
2601 """Report attempt to download playlist page with given number."""
2602 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2604 def _real_initialize(self):
2605 self._youtube_ie.initialize()
2607 def _real_extract(self, url):
2608 # Extract playlist id
2609 mobj = re.match(self._VALID_URL, url)
2611 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: URL carried a specific video id, delegate and stop.
2615 if mobj.group(3) is not None:
2616 self._youtube_ie.extract(mobj.group(3))
2619 # Download playlist pages
2620 # prefix is 'p' as default for playlists but there are other types that need extra care
2621 playlist_prefix = mobj.group(1)
2622 if playlist_prefix == 'a':
2623 playlist_access = 'artist'
2625 playlist_prefix = 'p'
2626 playlist_access = 'view_play_list'
2627 playlist_id = mobj.group(2)
2632 self.report_download_page(playlist_id, pagenum)
2633 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2634 request = urllib2.Request(url)
2636 page = urllib2.urlopen(request).read()
2637 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2638 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2641 # Extract video identifiers
2643 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2644 if mobj.group(1) not in ids_in_page:
2645 ids_in_page.append(mobj.group(1))
2646 video_ids.extend(ids_in_page)
2648 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2650 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window; playliststart is
# stored 1-based in params, hence the -1; playlistend == -1 means "to the end".
2652 playliststart = self._downloader.params.get('playliststart', 1) - 1
2653 playlistend = self._downloader.params.get('playlistend', -1)
2654 if playlistend == -1:
2655 video_ids = video_ids[playliststart:]
2657 video_ids = video_ids[playliststart:playlistend]
2659 for id in video_ids:
2660 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted loop headers / guard / break lines.
2664 class YoutubeUserIE(InfoExtractor):
2665 """Information Extractor for YouTube users."""
2667 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2668 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads queries at 50 results per request, hence the paging below.
2669 _GDATA_PAGE_SIZE = 50
2670 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2671 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2673 IE_NAME = u'youtube:user'
2675 def __init__(self, youtube_ie, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2677 self._youtube_ie = youtube_ie
2679 def report_download_page(self, username, start_index):
2680 """Report attempt to download user page."""
2681 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2682 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2684 def _real_initialize(self):
2685 self._youtube_ie.initialize()
2687 def _real_extract(self, url):
2689 mobj = re.match(self._VALID_URL, url)
2691 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2694 username = mobj.group(1)
2696 # Download video ids using YouTube Data API. Result size per
2697 # query is limited (currently to 50 videos) so we need to query
2698 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2705 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2706 self.report_download_page(username, start_index)
2708 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2711 page = urllib2.urlopen(request).read()
2712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2713 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2716 # Extract video identifiers
2719 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2720 if mobj.group(1) not in ids_in_page:
2721 ids_in_page.append(mobj.group(1))
2723 video_ids.extend(ids_in_page)
2725 # A little optimization - if current page is not
2726 # "full", ie. does not contain PAGE_SIZE video ids then
2727 # we can assume that this page is the last one - there
2728 # are no more ids on further pages - no need to query
2731 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Same --playlist-start/--playlist-end windowing as YoutubePlaylistIE:
# 1-based start in params (hence -1); end == -1 means "to the end".
2736 all_ids_count = len(video_ids)
2737 playliststart = self._downloader.params.get('playliststart', 1) - 1
2738 playlistend = self._downloader.params.get('playlistend', -1)
2740 if playlistend == -1:
2741 video_ids = video_ids[playliststart:]
2743 video_ids = video_ids[playliststart:playlistend]
2745 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2746 (username, all_ids_count, len(video_ids)))
2748 for video_id in video_ids:
2749 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted `try:` / `if mobj is None:` / `return` lines.
2752 class DepositFilesIE(InfoExtractor):
2753 """Information extractor for depositfiles.com"""
2755 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2756 IE_NAME = u'DepositFiles'
2758 def __init__(self, downloader=None):
2759 InfoExtractor.__init__(self, downloader)
2761 def report_download_webpage(self, file_id):
2762 """Report webpage download."""
2763 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2765 def report_extraction(self, file_id):
2766 """Report information extraction."""
2767 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2769 def _real_extract(self, url):
2770 # At this point we have a new file
2771 self._downloader.increment_downloads()
2773 file_id = url.split('/')[-1]
2774 # Rebuild url in english locale
# Forces the /en/ locale so the regexes below match English page markup.
2775 url = 'http://depositfiles.com/en/files/' + file_id
2777 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the "Free download" button.
2778 free_download_indication = { 'gateway_result' : '1' }
2779 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2781 self.report_download_webpage(file_id)
2782 webpage = urllib2.urlopen(request).read()
2783 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2784 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2787 # Search for the real file URL
2788 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2789 if (mobj is None) or (mobj.group(1) is None):
2790 # Try to figure out reason of the error.
# The site explains refusals (rate limit, premium-only, ...) in an
# <strong>Attention...</strong> blurb; surface that text to the user.
2791 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2792 if (mobj is not None) and (mobj.group(1) is not None):
2793 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2794 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2796 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2799 file_url = mobj.group(1)
2800 file_extension = os.path.splitext(file_url)[1][1:]
2802 # Search for file title
2803 mobj = re.search(r'<b title="(.*?)">', webpage)
2805 self._downloader.trouble(u'ERROR: unable to extract title')
2807 file_title = mobj.group(1).decode('utf-8')
2810 # Process file information
# NOTE(review): the 'uploader' entry (original line 2814) is elided from this
# view — do not assume it is missing from the real file.
2811 self._downloader.process_info({
2812 'id': file_id.decode('utf-8'),
2813 'url': file_url.decode('utf-8'),
2815 'upload_date': u'NA',
2816 'title': file_title,
2817 'stitle': file_title,
2818 'ext': file_extension.decode('utf-8'),
2822 except UnavailableVideoError, err:
2823 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): elided listing — embedded numbers are the original file's line
# numbers; gaps mark omitted lines (`try:`, `if ... is None:`, `return`, the
# closing brace of _video_extensions, the default assignments for useremail/
# upload_date, the login_form literal, among others).
2826 class FacebookIE(InfoExtractor):
2827 """Information Extractor for Facebook"""
2829 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2830 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2831 _NETRC_MACHINE = 'facebook'
# Ordered best-quality-first; format selection below walks this list.
2832 _available_formats = ['video', 'highqual', 'lowqual']
2833 _video_extensions = {
2838 IE_NAME = u'facebook'
2840 def __init__(self, downloader=None):
2841 InfoExtractor.__init__(self, downloader)
2843 def _reporter(self, message):
2844 """Add header and report message."""
2845 self._downloader.to_screen(u'[facebook] %s' % message)
2847 def report_login(self):
2848 """Report attempt to log in."""
2849 self._reporter(u'Logging in')
2851 def report_video_webpage_download(self, video_id):
2852 """Report attempt to download video webpage."""
2853 self._reporter(u'%s: Downloading video webpage' % video_id)
2855 def report_information_extraction(self, video_id):
2856 """Report attempt to extract video information."""
2857 self._reporter(u'%s: Extracting video information' % video_id)
2859 def _parse_page(self, video_webpage):
2860 """Extract video information from page"""
# Metadata lives in JS calls / markup fragments; each regex captures one field.
2862 data = {'title': r'\("video_title", "(.*?)"\)',
2863 'description': r'<div class="datawrap">(.*?)</div>',
2864 'owner': r'\("video_owner_name", "(.*?)"\)',
2865 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2868 for piece in data.keys():
2869 mobj = re.search(data[piece], video_webpage)
2870 if mobj is not None:
# Values are JS-escaped Unicode inside the (generally utf-8) page.
2871 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875 for fmt in self._available_formats:
2876 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2877 if mobj is not None:
2878 # URL is in a Javascript segment inside an escaped Unicode format within
2879 # the generally utf-8 page
2880 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2881 video_info['video_urls'] = video_urls
2885 def _real_initialize(self):
2886 if self._downloader is None:
2891 downloader_params = self._downloader.params
2893 # Attempt to use provided username and password or .netrc data
2894 if downloader_params.get('username', None) is not None:
2895 useremail = downloader_params['username']
2896 password = downloader_params['password']
2897 elif downloader_params.get('usenetrc', False):
2899 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2900 if info is not None:
2904 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# netrc problems are non-fatal: warn and continue without credentials.
2905 except (IOError, netrc.NetrcParseError), err:
2906 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2909 if useremail is None:
2918 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2921 login_results = urllib2.urlopen(request).read()
# Heuristic: if the login form is still present in the response, login failed.
2922 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2923 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2925 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2926 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2934 video_id = mobj.group('ID')
2937 self.report_video_webpage_download(video_id)
2938 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2940 page = urllib2.urlopen(request)
2941 video_webpage = page.read()
2942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2943 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2946 # Start extracting information
2947 self.report_information_extraction(video_id)
2949 # Extract information
2950 video_info = self._parse_page(video_webpage)
2953 if 'owner' not in video_info:
2954 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2956 video_uploader = video_info['owner']
2959 if 'title' not in video_info:
2960 self._downloader.trouble(u'ERROR: unable to extract video title')
2962 video_title = video_info['title']
2963 video_title = video_title.decode('utf-8')
2964 video_title = sanitize_title(video_title)
2966 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2969 if 'thumbnail' not in video_info:
2970 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2971 video_thumbnail = ''
2973 video_thumbnail = video_info['thumbnail']
# NOTE(review): the default assignment of `upload_date` (before this guard)
# is elided from view; it is referenced unconditionally at line 3030 below.
2977 if 'upload_date' in video_info:
2978 upload_time = video_info['upload_date']
2979 timetuple = email.utils.parsedate_tz(upload_time)
2980 if timetuple is not None:
2982 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2987 video_description = video_info.get('description', 'No description available.')
2989 url_map = video_info['video_urls']
2990 if len(url_map.keys()) > 0:
# Format selection mirrors the YouTube extractor: apply an optional
# format_limit ceiling, then honor -f (None=best, 'worst', '-1'=all, exact).
2991 # Decide which formats to download
2992 req_format = self._downloader.params.get('format', None)
2993 format_limit = self._downloader.params.get('format_limit', None)
2995 if format_limit is not None and format_limit in self._available_formats:
2996 format_list = self._available_formats[self._available_formats.index(format_limit):]
2998 format_list = self._available_formats
2999 existing_formats = [x for x in format_list if x in url_map]
3000 if len(existing_formats) == 0:
3001 self._downloader.trouble(u'ERROR: no known formats available for video')
3003 if req_format is None:
3004 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3005 elif req_format == 'worst':
3006 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3007 elif req_format == '-1':
3008 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3011 if req_format not in url_map:
3012 self._downloader.trouble(u'ERROR: requested format not available')
3014 video_url_list = [(req_format, url_map[req_format])] # Specific format
3016 for format_param, video_real_url in video_url_list:
3018 # At this point we have a new video
3019 self._downloader.increment_downloads()
3022 video_extension = self._video_extensions.get(format_param, 'mp4')
3025 # Process video information
3026 self._downloader.process_info({
3027 'id': video_id.decode('utf-8'),
3028 'url': video_real_url.decode('utf-8'),
3029 'uploader': video_uploader.decode('utf-8'),
3030 'upload_date': upload_date,
3031 'title': video_title,
3032 'stitle': simple_title,
3033 'ext': video_extension.decode('utf-8'),
3034 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3035 'thumbnail': video_thumbnail.decode('utf-8'),
3036 'description': video_description.decode('utf-8'),
3039 except UnavailableVideoError, err:
3040 self._downloader.trouble(u'\nERROR: unable to download video')
3042 class BlipTVIE(InfoExtractor):
3043 """Information extractor for blip.tv"""
3045 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3046 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3047 IE_NAME = u'blip.tv'
3049 def report_extraction(self, file_id):
3050 """Report information extraction."""
3051 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3053 def report_direct_download(self, title):
3054 """Report information extraction."""
3055 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3057 def _real_extract(self, url):
3058 mobj = re.match(self._VALID_URL, url)
3060 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3067 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3068 request = urllib2.Request(json_url)
3069 self.report_extraction(mobj.group(1))
3072 urlh = urllib2.urlopen(request)
3073 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3074 basename = url.split('/')[-1]
3075 title,ext = os.path.splitext(basename)
3076 title = title.decode('UTF-8')
3077 ext = ext.replace('.', '')
3078 self.report_direct_download(title)
3083 'stitle': _simplify_title(title),
3087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3088 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3090 if info is None: # Regular URL
3092 json_code = urlh.read()
3093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3094 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3098 json_data = json.loads(json_code)
3099 if 'Post' in json_data:
3100 data = json_data['Post']
3104 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3105 video_url = data['media']['url']
3106 umobj = re.match(self._URL_EXT, video_url)
3108 raise ValueError('Can not determine filename extension')
3109 ext = umobj.group(1)
3112 'id': data['item_id'],
3114 'uploader': data['display_name'],
3115 'upload_date': upload_date,
3116 'title': data['title'],
3117 'stitle': _simplify_title(data['title']),
3119 'format': data['media']['mimeType'],
3120 'thumbnail': data['thumbnailUrl'],
3121 'description': data['description'],
3122 'player_url': data['embedUrl']
3124 except (ValueError,KeyError), err:
3125 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3128 self._downloader.increment_downloads()
3131 self._downloader.process_info(info)
3132 except UnavailableVideoError, err:
3133 self._downloader.trouble(u'\nERROR: unable to download video')
3136 class MyVideoIE(InfoExtractor):
3137 """Information Extractor for myvideo.de."""
3139 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3140 IE_NAME = u'myvideo'
3142 def __init__(self, downloader=None):
3143 InfoExtractor.__init__(self, downloader)
3145 def report_download_webpage(self, video_id):
3146 """Report webpage download."""
3147 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3149 def report_extraction(self, video_id):
3150 """Report information extraction."""
3151 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3153 def _real_extract(self,url):
3154 mobj = re.match(self._VALID_URL, url)
3156 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3159 video_id = mobj.group(1)
3162 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3164 self.report_download_webpage(video_id)
3165 webpage = urllib2.urlopen(request).read()
3166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3167 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3170 self.report_extraction(video_id)
3171 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3174 self._downloader.trouble(u'ERROR: unable to extract media URL')
3176 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3178 mobj = re.search('<title>([^<]+)</title>', webpage)
3180 self._downloader.trouble(u'ERROR: unable to extract title')
3183 video_title = mobj.group(1)
3184 video_title = sanitize_title(video_title)
3186 simple_title = _simplify_title(video_title)
3189 self._downloader.process_info({
3193 'upload_date': u'NA',
3194 'title': video_title,
3195 'stitle': simple_title,
3200 except UnavailableVideoError:
3201 self._downloader.trouble(u'\nERROR: Unable to download video')
3203 class ComedyCentralIE(InfoExtractor):
3204 """Information extractor for The Daily Show and Colbert Report """
3206 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3207 IE_NAME = u'comedycentral'
3209 def report_extraction(self, episode_id):
3210 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3212 def report_config_download(self, episode_id):
3213 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3215 def report_index_download(self, episode_id):
3216 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3218 def report_player_url(self, episode_id):
3219 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3221 def _real_extract(self, url):
3222 mobj = re.match(self._VALID_URL, url)
3224 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3227 if mobj.group('shortname'):
3228 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3229 url = u'http://www.thedailyshow.com/full-episodes/'
3231 url = u'http://www.colbertnation.com/full-episodes/'
3232 mobj = re.match(self._VALID_URL, url)
3233 assert mobj is not None
3235 dlNewest = not mobj.group('episode')
3237 epTitle = mobj.group('showname')
3239 epTitle = mobj.group('episode')
3241 req = urllib2.Request(url)
3242 self.report_extraction(epTitle)
3244 htmlHandle = urllib2.urlopen(req)
3245 html = htmlHandle.read()
3246 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3247 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3250 url = htmlHandle.geturl()
3251 mobj = re.match(self._VALID_URL, url)
3253 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3255 if mobj.group('episode') == '':
3256 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3258 epTitle = mobj.group('episode')
3260 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3261 if len(mMovieParams) == 0:
3262 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3265 playerUrl_raw = mMovieParams[0][0]
3266 self.report_player_url(epTitle)
3268 urlHandle = urllib2.urlopen(playerUrl_raw)
3269 playerUrl = urlHandle.geturl()
3270 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3271 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3274 uri = mMovieParams[0][1]
3275 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3276 self.report_index_download(epTitle)
3278 indexXml = urllib2.urlopen(indexUrl).read()
3279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3280 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3283 idoc = xml.etree.ElementTree.fromstring(indexXml)
3284 itemEls = idoc.findall('.//item')
3285 for itemEl in itemEls:
3286 mediaId = itemEl.findall('./guid')[0].text
3287 shortMediaId = mediaId.split(':')[-1]
3288 showId = mediaId.split(':')[-2].replace('.com', '')
3289 officialTitle = itemEl.findall('./title')[0].text
3290 officialDate = itemEl.findall('./pubDate')[0].text
3292 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3293 urllib.urlencode({'uri': mediaId}))
3294 configReq = urllib2.Request(configUrl)
3295 self.report_config_download(epTitle)
3297 configXml = urllib2.urlopen(configReq).read()
3298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3299 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3302 cdoc = xml.etree.ElementTree.fromstring(configXml)
3304 for rendition in cdoc.findall('.//rendition'):
3305 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3309 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3312 # For now, just pick the highest bitrate
3313 format,video_url = turls[-1]
3315 self._downloader.increment_downloads()
3317 effTitle = showId + u'-' + epTitle
3322 'upload_date': officialDate,
3324 'stitle': _simplify_title(effTitle),
3328 'description': officialTitle,
3329 'player_url': playerUrl
3333 self._downloader.process_info(info)
3334 except UnavailableVideoError, err:
3335 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3339 class EscapistIE(InfoExtractor):
3340 """Information extractor for The Escapist """
3342 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3343 IE_NAME = u'escapist'
3345 def report_extraction(self, showName):
3346 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3348 def report_config_download(self, showName):
3349 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3351 def _real_extract(self, url):
3352 htmlParser = HTMLParser.HTMLParser()
3354 mobj = re.match(self._VALID_URL, url)
3356 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3358 showName = mobj.group('showname')
3359 videoId = mobj.group('episode')
3361 self.report_extraction(showName)
3363 webPage = urllib2.urlopen(url).read()
3364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3365 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3368 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3369 description = htmlParser.unescape(descMatch.group(1))
3370 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3371 imgUrl = htmlParser.unescape(imgMatch.group(1))
3372 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3373 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3374 configUrlMatch = re.search('config=(.*)$', playerUrl)
3375 configUrl = urllib2.unquote(configUrlMatch.group(1))
3377 self.report_config_download(showName)
3379 configJSON = urllib2.urlopen(configUrl).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3384 # Technically, it's JavaScript, not JSON
3385 configJSON = configJSON.replace("'", '"')
3388 config = json.loads(configJSON)
3389 except (ValueError,), err:
3390 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3393 playlist = config['playlist']
3394 videoUrl = playlist[1]['url']
3396 self._downloader.increment_downloads()
3400 'uploader': showName,
3401 'upload_date': None,
3403 'stitle': _simplify_title(showName),
3406 'thumbnail': imgUrl,
3407 'description': description,
3408 'player_url': playerUrl,
3412 self._downloader.process_info(info)
3413 except UnavailableVideoError, err:
3414 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3417 class CollegeHumorIE(InfoExtractor):
3418 """Information extractor for collegehumor.com"""
3420 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3421 IE_NAME = u'collegehumor'
3423 def report_webpage(self, video_id):
3424 """Report information extraction."""
3425 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3427 def report_extraction(self, video_id):
3428 """Report information extraction."""
3429 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3431 def _real_extract(self, url):
3432 htmlParser = HTMLParser.HTMLParser()
3434 mobj = re.match(self._VALID_URL, url)
3436 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3438 video_id = mobj.group('videoid')
3440 self.report_webpage(video_id)
3441 request = urllib2.Request(url)
3443 webpage = urllib2.urlopen(request).read()
3444 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3445 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3448 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3450 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3452 internal_video_id = m.group('internalvideoid')
3456 'internal_id': internal_video_id,
3459 self.report_extraction(video_id)
3460 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3462 metaXml = urllib2.urlopen(xmlUrl).read()
3463 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3464 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3467 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3469 videoNode = mdoc.findall('./video')[0]
3470 info['description'] = videoNode.findall('./description')[0].text
3471 info['title'] = videoNode.findall('./caption')[0].text
3472 info['stitle'] = _simplify_title(info['title'])
3473 info['url'] = videoNode.findall('./file')[0].text
3474 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3475 info['ext'] = info['url'].rpartition('.')[2]
3476 info['format'] = info['ext']
3478 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3481 self._downloader.increment_downloads()
3484 self._downloader.process_info(info)
3485 except UnavailableVideoError, err:
3486 self._downloader.trouble(u'\nERROR: unable to download video')
3489 class XVideosIE(InfoExtractor):
3490 """Information extractor for xvideos.com"""
3492 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3493 IE_NAME = u'xvideos'
3495 def report_webpage(self, video_id):
3496 """Report information extraction."""
3497 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3499 def report_extraction(self, video_id):
3500 """Report information extraction."""
3501 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3503 def _real_extract(self, url):
3504 htmlParser = HTMLParser.HTMLParser()
3506 mobj = re.match(self._VALID_URL, url)
3508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3510 video_id = mobj.group(1).decode('utf-8')
3512 self.report_webpage(video_id)
3514 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3516 webpage = urllib2.urlopen(request).read()
3517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3521 self.report_extraction(video_id)
3525 mobj = re.search(r'flv_url=(.+?)&', webpage)
3527 self._downloader.trouble(u'ERROR: unable to extract video url')
3529 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3533 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3535 self._downloader.trouble(u'ERROR: unable to extract video title')
3537 video_title = mobj.group(1).decode('utf-8')
3540 # Extract video thumbnail
3541 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3543 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3545 video_thumbnail = mobj.group(1).decode('utf-8')
3549 self._downloader.increment_downloads()
3554 'upload_date': None,
3555 'title': video_title,
3556 'stitle': _simplify_title(video_title),
3559 'thumbnail': video_thumbnail,
3560 'description': None,
3565 self._downloader.process_info(info)
3566 except UnavailableVideoError, err:
3567 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3570 class SoundcloudIE(InfoExtractor):
3571 """Information extractor for soundcloud.com
3572 To access the media, the uid of the song and a stream token
3573 must be extracted from the page source and the script must make
3574 a request to media.soundcloud.com/crossdomain.xml. Then
3575 the media can be grabbed by requesting from an url composed
3576 of the stream token and uid
3579 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3580 IE_NAME = u'soundcloud'
3582 def __init__(self, downloader=None):
3583 InfoExtractor.__init__(self, downloader)
3585 def report_webpage(self, video_id):
3586 """Report information extraction."""
3587 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3589 def report_extraction(self, video_id):
3590 """Report information extraction."""
3591 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3593 def _real_extract(self, url):
3594 htmlParser = HTMLParser.HTMLParser()
3596 mobj = re.match(self._VALID_URL, url)
3598 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3601 # extract uploader (which is in the url)
3602 uploader = mobj.group(1).decode('utf-8')
3603 # extract simple title (uploader + slug of song title)
3604 slug_title = mobj.group(2).decode('utf-8')
3605 simple_title = uploader + '-' + slug_title
3607 self.report_webpage('%s/%s' % (uploader, slug_title))
3609 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3611 webpage = urllib2.urlopen(request).read()
3612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3613 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3616 self.report_extraction('%s/%s' % (uploader, slug_title))
3618 # extract uid and stream token that soundcloud hands out for access
3619 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3621 video_id = mobj.group(1)
3622 stream_token = mobj.group(2)
3624 # extract unsimplified title
3625 mobj = re.search('"title":"(.*?)",', webpage)
3627 title = mobj.group(1)
3629 # construct media url (with uid/token)
3630 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3631 mediaURL = mediaURL % (video_id, stream_token)
3634 description = u'No description available'
3635 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3637 description = mobj.group(1)
3641 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3644 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3645 except Exception, e:
3648 # for soundcloud, a request to a cross domain is required for cookies
3649 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3652 self._downloader.process_info({
3653 'id': video_id.decode('utf-8'),
3655 'uploader': uploader.decode('utf-8'),
3656 'upload_date': upload_date,
3657 'title': simple_title.decode('utf-8'),
3658 'stitle': simple_title.decode('utf-8'),
3662 'description': description.decode('utf-8')
3664 except UnavailableVideoError:
3665 self._downloader.trouble(u'\nERROR: unable to download video')
3668 class InfoQIE(InfoExtractor):
3669 """Information extractor for infoq.com"""
3671 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3674 def report_webpage(self, video_id):
3675 """Report information extraction."""
3676 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3678 def report_extraction(self, video_id):
3679 """Report information extraction."""
3680 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3682 def _real_extract(self, url):
3683 htmlParser = HTMLParser.HTMLParser()
3685 mobj = re.match(self._VALID_URL, url)
3687 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3690 self.report_webpage(url)
3692 request = urllib2.Request(url)
3694 webpage = urllib2.urlopen(request).read()
3695 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3696 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3699 self.report_extraction(url)
3703 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3705 self._downloader.trouble(u'ERROR: unable to extract video url')
3707 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3711 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3713 self._downloader.trouble(u'ERROR: unable to extract video title')
3715 video_title = mobj.group(1).decode('utf-8')
3717 # Extract description
3718 video_description = u'No description available.'
3719 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3720 if mobj is not None:
3721 video_description = mobj.group(1).decode('utf-8')
3723 video_filename = video_url.split('/')[-1]
3724 video_id, extension = video_filename.split('.')
3726 self._downloader.increment_downloads()
3731 'upload_date': None,
3732 'title': video_title,
3733 'stitle': _simplify_title(video_title),
3735 'format': extension, # Extension is always(?) mp4, but seems to be flv
3737 'description': video_description,
3742 self._downloader.process_info(info)
3743 except UnavailableVideoError, err:
3744 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3746 class MixcloudIE(InfoExtractor):
3747 """Information extractor for www.mixcloud.com"""
3748 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3749 IE_NAME = u'mixcloud'
3751 def __init__(self, downloader=None):
3752 InfoExtractor.__init__(self, downloader)
3754 def report_download_json(self, file_id):
3755 """Report JSON download."""
3756 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3758 def report_extraction(self, file_id):
3759 """Report information extraction."""
3760 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3762 def get_urls(self, jsonData, fmt, bitrate='best'):
3763 """Get urls from 'audio_formats' section in json"""
3766 bitrate_list = jsonData[fmt]
3767 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3768 bitrate = max(bitrate_list) # select highest
3770 url_list = jsonData[fmt][bitrate]
3771 except TypeError: # we have no bitrate info.
3772 url_list = jsonData[fmt]
3776 def check_urls(self, url_list):
3777 """Returns 1st active url from list"""
3778 for url in url_list:
3780 urllib2.urlopen(url)
3782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3787 def _print_formats(self, formats):
3788 print 'Available formats:'
3789 for fmt in formats.keys():
3790 for b in formats[fmt]:
3792 ext = formats[fmt][b][0]
3793 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3794 except TypeError: # we have no bitrate info
3795 ext = formats[fmt][0]
3796 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3799 def _real_extract(self, url):
3800 mobj = re.match(self._VALID_URL, url)
3802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3804 # extract uploader & filename from url
3805 uploader = mobj.group(1).decode('utf-8')
3806 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3808 # construct API request
3809 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3810 # retrieve .json file with links to files
3811 request = urllib2.Request(file_url)
3813 self.report_download_json(file_url)
3814 jsonData = urllib2.urlopen(request).read()
3815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3816 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3820 json_data = json.loads(jsonData)
3821 player_url = json_data['player_swf_url']
3822 formats = dict(json_data['audio_formats'])
3824 req_format = self._downloader.params.get('format', None)
3827 if self._downloader.params.get('listformats', None):
3828 self._print_formats(formats)
3831 if req_format is None or req_format == 'best':
3832 for format_param in formats.keys():
3833 url_list = self.get_urls(formats, format_param)
3835 file_url = self.check_urls(url_list)
3836 if file_url is not None:
3839 if req_format not in formats.keys():
3840 self._downloader.trouble(u'ERROR: format is not available')
3843 url_list = self.get_urls(formats, req_format)
3844 file_url = self.check_urls(url_list)
3845 format_param = req_format
3848 self._downloader.increment_downloads()
3850 # Process file information
3851 self._downloader.process_info({
3852 'id': file_id.decode('utf-8'),
3853 'url': file_url.decode('utf-8'),
3854 'uploader': uploader.decode('utf-8'),
3855 'upload_date': u'NA',
3856 'title': json_data['name'],
3857 'stitle': _simplify_title(json_data['name']),
3858 'ext': file_url.split('.')[-1].decode('utf-8'),
3859 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3860 'thumbnail': json_data['thumbnail_url'],
3861 'description': json_data['description'],
3862 'player_url': player_url.decode('utf-8'),
3864 except UnavailableVideoError, err:
3865 self._downloader.trouble(u'ERROR: unable to download file')
3867 class StanfordOpenClassroomIE(InfoExtractor):
3868 """Information extractor for Stanford's Open ClassRoom"""
3870 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3871 IE_NAME = u'stanfordoc'
3873 def report_download_webpage(self, objid):
3874 """Report information extraction."""
3875 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3877 def report_extraction(self, video_id):
3878 """Report information extraction."""
3879 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3881 def _real_extract(self, url):
3882 mobj = re.match(self._VALID_URL, url)
3884 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3887 if mobj.group('course') and mobj.group('video'): # A specific video
3888 course = mobj.group('course')
3889 video = mobj.group('video')
3891 'id': _simplify_title(course + '_' + video),
3894 self.report_extraction(info['id'])
3895 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3896 xmlUrl = baseUrl + video + '.xml'
3898 metaXml = urllib2.urlopen(xmlUrl).read()
3899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3900 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3902 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3904 info['title'] = mdoc.findall('./title')[0].text
3905 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3907 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3909 info['stitle'] = _simplify_title(info['title'])
3910 info['ext'] = info['url'].rpartition('.')[2]
3911 info['format'] = info['ext']
3912 self._downloader.increment_downloads()
3914 self._downloader.process_info(info)
3915 except UnavailableVideoError, err:
3916 self._downloader.trouble(u'\nERROR: unable to download video')
3917 elif mobj.group('course'): # A course page
3918 unescapeHTML = HTMLParser.HTMLParser().unescape
3920 course = mobj.group('course')
3922 'id': _simplify_title(course),
3926 self.report_download_webpage(info['id'])
3928 coursepage = urllib2.urlopen(url).read()
3929 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3930 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3933 m = re.search('<h1>([^<]+)</h1>', coursepage)
3935 info['title'] = unescapeHTML(m.group(1))
3937 info['title'] = info['id']
3938 info['stitle'] = _simplify_title(info['title'])
3940 m = re.search('<description>([^<]+)</description>', coursepage)
3942 info['description'] = unescapeHTML(m.group(1))
3944 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3947 'type': 'reference',
3948 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3952 for entry in info['list']:
3953 assert entry['type'] == 'reference'
3954 self.extract(entry['url'])
3956 unescapeHTML = HTMLParser.HTMLParser().unescape
3959 'id': 'Stanford OpenClassroom',
3963 self.report_download_webpage(info['id'])
3964 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3966 rootpage = urllib2.urlopen(rootURL).read()
3967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3968 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3971 info['title'] = info['id']
3972 info['stitle'] = _simplify_title(info['title'])
3974 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3977 'type': 'reference',
3978 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3982 for entry in info['list']:
3983 assert entry['type'] == 'reference'
3984 self.extract(entry['url'])
3986 class MTVIE(InfoExtractor):
3987 """Information extractor for MTV.com"""
3989 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3992 def report_webpage(self, video_id):
3993 """Report information extraction."""
3994 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3996 def report_extraction(self, video_id):
3997 """Report information extraction."""
3998 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4000 def _real_extract(self, url):
4001 mobj = re.match(self._VALID_URL, url)
4003 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4005 if not mobj.group('proto'):
4006 url = 'http://' + url
4007 video_id = mobj.group('videoid')
4008 self.report_webpage(video_id)
4010 request = urllib2.Request(url)
4012 webpage = urllib2.urlopen(request).read()
4013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4014 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4017 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4019 self._downloader.trouble(u'ERROR: unable to extract song name')
4021 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4024 self._downloader.trouble(u'ERROR: unable to extract performer')
4026 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4027 video_title = performer + ' - ' + song_name
4029 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4031 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4033 mtvn_uri = mobj.group(1)
4035 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4037 self._downloader.trouble(u'ERROR: unable to extract content id')
4039 content_id = mobj.group(1)
4041 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4042 self.report_extraction(video_id)
4043 request = urllib2.Request(videogen_url)
4045 metadataXml = urllib2.urlopen(request).read()
4046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4047 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4050 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4051 renditions = mdoc.findall('.//rendition')
4053 # For now, always pick the highest quality.
4054 rendition = renditions[-1]
4057 _,_,ext = rendition.attrib['type'].partition('/')
4058 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4059 video_url = rendition.find('./src').text
4061 self._downloader.trouble('Invalid rendition field.')
4064 self._downloader.increment_downloads()
4068 'uploader': performer,
4069 'title': video_title,
4070 'stitle': _simplify_title(video_title),
4076 self._downloader.process_info(info)
4077 except UnavailableVideoError, err:
4078 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-processing steps.

    A PostProcessor is attached to a downloader via the downloader's
    add_post_processor() method.  After each successful download the
    downloader walks its internal chain of PostProcessors, calling run()
    on every one — first with the freshly produced info dictionary, then
    with whatever the previous processor returned.

    The chain stops as soon as one processor returns None, or when the
    end of the chain is reached.

    PostProcessor objects follow the same "mutual registration"
    convention as InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        The "information" argument is an InfoExtractor-style dictionary
        extended with a "filepath" key that points at the downloaded
        file.  Returning None stops the post-processing chain; returning
        an information dictionary (possibly the received one with some
        fields changed) passes it to the next processor in the chain.
        Implementations may also raise a PostProcessingError, which the
        downloader takes into account.
        """
        # The base class is a no-op: pass the information through unchanged.
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    Derives from Exception rather than BaseException: per PEP 352 only
    process-control exceptions (KeyboardInterrupt, SystemExit) should
    subclass BaseException directly, and subclassing it here made this
    error silently escape generic `except Exception` handlers.
    """
    def __init__(self, message):
        # Initialize the base class too, so str(e) and e.args carry the text.
        Exception.__init__(self, message)
        # Callers read .message directly (e.g. the FFmpegExtractAudioPP
        # error reporting), so keep the attribute.
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that converts a downloaded video file into an
    audio-only file, probing the source codec with ffprobe and converting
    with ffmpeg.

    NOTE(review): several structural lines (`try:`, `else:`, `return`
    statements and the @staticmethod decorators) appear to be elided from
    this copy of the file; restore them from upstream before running.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            # 'best' means: keep the source audio codec whenever possible.
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        # ffmpeg '-ab' bitrate specification (e.g. '128K'); may be None.
        self._preferredquality = preferredquality
        # When true, the original video file is left on disk after conversion.
        self._keepvideo = keepvideo

    # Probe `path` with ffprobe and report the name of its audio codec.
    # Defined without `self` — presumably decorated @staticmethod in the
    # full file; confirm before editing.
    def get_audio_codec(path):
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        # stderr is discarded; only the stream description on stdout matters.
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        # NOTE(review): the body of this `if`, and the `try:` matching the
        # `except` below, are elided from this copy.
        if handle.wait() != 0:
        except (IOError, OSError):
        # Scan ffprobe's key=value output: remember the most recent
        # codec_name and report it once a codec_type=audio stream appears.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:

    # Transcode `path` into `out_path` with the given audio codec and extra
    # ffmpeg options; raises AudioConversionError on failure.
    # Also presumably a @staticmethod (no `self`) — confirm.
    def run_ffmpeg(path, out_path, codec, more_opts):
            acodec_opts = ['-acodec', codec]
        # '-vn' drops the video stream; '--' guards against option-like paths.
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
            # NOTE(review): the `try:` opening this block is elided here.
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            # errno 2 == ENOENT: the ffmpeg binary itself is missing.
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
        if p.returncode != 0:
            # Surface only ffmpeg's final stderr line as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert information['filepath'] to the preferred audio format and
        point 'filepath' at the new file."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

        # Keep the stream as-is when the source already matches the request.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                extension = filecodec
                if filecodec == 'aac':
                    # ADTS framing so the raw AAC stream is playable on its own.
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                # NOTE(review): the fall-through branch header introducing the
                # MP3 default is elided above this line.
                acodec = 'libmp3lame'
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # NOTE(review): an `else:` introducing this lossy-conversion branch
        # is elided here.
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
            if self._preferredcodec == 'wav':
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
            # NOTE(review): the `try:` opening this conversion attempt is
            # elided here.
            self.run_ffmpeg(path, new_path, acodec, more_opts)
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
                # NOTE(review): the `try:` matching the `except` below is
                # elided here.
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        # Point the chain at the converted file for subsequent processors.
        information['filepath'] = new_path
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

        # NOTE(review): the `try:` lines opening this download block are
        # elided from this copy.
        urlh = urllib.urlopen(UPDATE_URL)
        newcontent = urlh.read()
        # The script embeds its own version string; skip the rewrite when
        # the published copy matches what is already installed.
        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
            downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

        # Overwrite this very script in place with the downloaded copy.
        # NOTE(review): the surrounding `try:` is elided here too.
        outf = open(filename, 'wb')
        outf.write(newcontent)
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line arguments from a configuration file, split with
# shell-like quoting rules ('#' starts a comment).  A missing file silently
# yields an empty argument list.
# NOTE(review): the try/except around open(), the per-line loop header, and
# the final `return res` are elided from this copy.
def _readOptions(filename_bytes):
        optionf = open(filename_bytes)
        return [] # silently skip if file is not present
        res += shlex.split(l, comments=True)
4304 def _format_option_string(option):
4305 ''' ('-o', '--option') -> -o, --format METAVAR'''
4309 if option._short_opts: opts.append(option._short_opts[0])
4310 if option._long_opts: opts.append(option._long_opts[0])
4311 if len(opts) > 1: opts.insert(1, ', ')
4313 if option.takes_value(): opts.append(' %s' % option.metavar)
4315 return "".join(opts)
4317 def _find_term_columns():
4318 columns = os.environ.get('COLUMNS', None)
4323 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4324 out,err = sp.communicate()
4325 return int(out.split()[1])
# --- Interior of parseOpts(): builds and returns the optparse command-line
# --- parser.  NOTE(review): the `def parseOpts():` header and the braces of
# --- the `kw = {...}` dict literal below are elided from this copy.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

# Custom formatter so option strings render via _format_option_string.
fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string

# Keyword arguments for the OptionParser constructor (dict entries; the
# enclosing `kw = {` / `}` lines are elided here).
	'version'   : __version__,
	'usage' : '%prog [options] url [url...]',
	'conflict_handler' : 'resolve',

parser = optparse.OptionParser(**kw)

# Option groups, one per help section.
general        = optparse.OptionGroup(parser, 'General Options')
selection      = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format   = optparse.OptionGroup(parser, 'Video Format Options')
postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)
general.add_option('--list-extractors',
		action='store_true', dest='list_extractors',
		help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
		action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
		action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
		action='store_true', dest='writesubtitles',
		help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
		action='store', dest='subtitleslang', metavar='LANG',
		help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
		action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
		action='store_true', dest='getformat',
		help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)
# NOTE(review): '-v' is registered twice (also as --version above); the
# 'resolve' conflict_handler makes the later definition win for -v.
verbosity.add_option('-v', '--verbose',
		action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
		action='store_false', dest='continue_dl',
		help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
		help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
		help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Options are read from /etc, then the per-user config (XDG if set), then
# the actual command line — later sources override earlier ones.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Shared IE instances reused by the derived playlist/user/search
    # extractors below.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # NOTE(review): the `return [` opening this extractor list, many of its
    # entries, and the closing `]` are elided from this copy.
    YoutubePlaylistIE(youtube_ie),
    YoutubeUserIE(youtube_ie),
    YoutubeSearchIE(youtube_ie),
    MetacafeIE(youtube_ie),
    GoogleSearchIE(google_ie),
    YahooSearchIE(yahoo_ie),
    StanfordOpenClassroomIE(),
# --- Interior of the program's main routine (its `def` line is elided from
# --- this copy).  Parses options, wires up the FileDownloader, and runs the
# --- downloads.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	# NOTE(review): the `else:`/`try:` introducing the file-backed jar, and
	# the jar.load() call under the readability check, are elided here.
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
	print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
	if opts.batchfile == '-':
		# NOTE(review): the stdin branch body and the surrounding try are
		# elided here.
		batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and lines starting with a comment marker.
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
all_urls = map(lambda url: url.strip(), all_urls)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# NOTE(review): presumably guarded by `if opts.verbose:` (elided).
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

# --list-extractors: print each extractor and the given URLs it would claim.
if opts.list_extractors:
	for ie in extractors:
		matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
		all_urls = filter(lambda url: url not in matchedUrls, all_urls)
		for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	# NOTE(review): the `try:` lines for these three conversions are elided.
	opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
		parser.error(u'invalid audio format specified')

# File downloader — options dictionary built from the parsed flags.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any of the "print X and stop" flags implies quiet mode.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'forceformat': opts.getformat,
	'simulate': opts.simulate,
	'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'listformats': opts.listformats,
	# Output template: explicit -o wins, otherwise pick a default shaped by
	# the title/literal/autonumber/format flags.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	'writesubtitles': opts.writesubtitles,
	'subtitleslang': opts.subtitleslang,
	'matchtitle': opts.matchtitle,
	'rejecttitle': opts.rejecttitle,
	'max_downloads': opts.max_downloads,
	'prefer_free_formats': opts.prefer_free_formats,
	'verbose': opts.verbose,
# NOTE(review): the `})` closing the FileDownloader options is elided here.
for extractor in extractors:
	fd.add_info_extractor(extractor)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing: --update alone is a valid invocation with no URLs.
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

	# NOTE(review): the `try:` around the download call is elided here.
	retcode = fd.download(all_urls)
except MaxDownloadsReached:
	fd.to_screen(u'--max-download limit reached, aborting.')

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')

# NOTE(review): the following except clauses belong to a thin wrapper
# function (presumably `def main():` calling the routine above) whose
# header and try: are elided from this copy.
except DownloadError:
except SameFileError:
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')
# Script entry point.  NOTE(review): the guarded body (the call into the
# main routine) is elided from this copy.
if __name__ == '__main__':

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: