2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
# parse_qs was moved from the cgi module to the urlparse module in Python 2.6.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# --- trivialjson fallback: minimal JSON parser used when the stdlib json
# module is unavailable (Python < 2.6). All helpers close over `s`, the
# JSON text being parsed, and thread the parse position `i` explicitly.
# NOTE(review): this region is a sampled extraction -- many original lines
# (loop bodies, the parseString/parseObj/parseArray/parseNumber headers)
# are missing; annotations below are conservative.
def raiseError(msg, i):
    # Abort parsing with a ValueError pinpointing the failing offset and
    # the unconsumed remainder of the input.
    raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
def skipSpace(i, expectMore=True):
    # Advance past JSON whitespace; when expectMore is set, running off
    # the end of the input is an error.
    while i < len(s) and s[i] in ' \t\r\n':
    raiseError('Premature end', i)
def decodeEscape(match):
    # Decode one backslash escape matched by `rexp` below.
    # NOTE(review): the simple-escape table (\n, \t, \", ...) that
    # presumably preceded these lines is missing from this extraction.
    return unichr(int(esc[1:5], 16))
    # Surrogate pair \uD8xx\uDCxx -> one astral-plane character.
    if len(esc) == 5+6 and esc[5:7] == '\\u':
    hi = int(esc[1:5], 16)
    low = int(esc[7:11], 16)
    return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
    raise ValueError('Unknown escape ' + str(esc))
# (parseString fragment) Find the closing quote: count backslashes that
# precede it; an even count means the quote itself is unescaped.
while s[e-bslashes-1] == '\\':
if bslashes % 2 == 1:
# Escape matcher: surrogate pair | single \uXXXX | any escaped char | EOS.
rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
stri = rexp.sub(decodeEscape, s[i:e])
# (parseObj fragment)
if s[i] == '}': # Empty dictionary
raiseError('Expected a string object key', i)
i,key = parseString(i)
if i >= len(s) or s[i] != ':':
raiseError('Expected a colon', i)
raiseError('Expected comma or closing curly brace', i)
# (parseArray fragment)
if s[i] == ']': # Empty array
i = skipSpace(i) # Raise exception if premature end
raiseError('Expected a comma or closing bracket', i)
def parseDiscrete(i):
    # Parse the literals true/false/null at position i.
    for k,v in {'true': True, 'false': False, 'null': None}.items():
        if s.startswith(k, i):
    raiseError('Not a boolean (or null)', i)
# (parseNumber fragment) JSON number grammar: optional sign, integer part,
# optional fraction and exponent.
mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
raiseError('Not a number', i)
# A fraction or exponent forces a float; otherwise keep exact int.
if '.' in nums or 'e' in nums or 'E' in nums:
    return (i+len(nums), float(nums))
return (i+len(nums), int(nums))
# Dispatch on the first significant character; anything else is a number.
CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
i,res = CHARMAP.get(s[i], parseNumber)(i)
i = skipSpace(i, False)
# Trailing non-whitespace after the top-level value is an error.
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): interior lines are missing from this extraction; the
    # generator originally wrapped getpreferredencoding() in a try/except
    # and yielded a fallback encoding on failure -- confirm upstream.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    # .next() pulls the first (only needed) value from the generator.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hex (#x10).
    # NOTE(review): \d only matches decimal digits, so hex references that
    # contain a-f (e.g. &#xA9;) fail to match and fall through to the
    # literal-return path below -- the group should accept hex digits
    # after an 'x' prefix.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # NOTE(review): the base=16/base=10 assignments are missing from
        # this extraction; '0x...' is built here so long(numstr, 16) works.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the original try:, the u'-' (stdout) special case and
    # the msvcrt import are missing from this extraction.
        if sys.platform == 'win32':
            # Put stdout into binary mode so video bytes are not mangled
            # by CRLF translation when downloading to '-'.
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): the 'timestamp = None' initialization and the final
    # 'return timestamp' are missing from this extraction -- the function
    # presumably returns None when the header cannot be parsed.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body is missing from this extraction; given the
    # name, it presumably returns the unique elements in first-seen order
    # -- confirm against upstream before relying on ordering.
def _unescapeHTML(s):
    """
    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    # Delegate entity decoding to the stdlib HTML parser.
    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
def _encodeFilename(s):
    """
    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # NOTE(review): the 'return s' for this branch is missing from this
        # extraction -- on NT the unicode name is returned unchanged.
    # Elsewhere: byte-encode using the filesystem encoding, silently
    # dropping characters it cannot represent.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    """
    # NOTE(review): the tail of this docstring (and likely a 'pass') is
    # missing from this extraction.


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
def __init__(self, downloaded, expected):
    """Record the received and the announced byte counts so callers can
    report the size mismatch."""
    self.downloaded, self.expected = downloaded, expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    """
    # NOTE(review): sampled extraction -- the docstring tail, the
    # @staticmethod decorators and the 'def deflate(data):' header with its
    # try: line are missing below.

    # deflate(): raw deflate first, then zlib-wrapped deflate as fallback.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl carrying the HTTP status code; older Pythons
        # lack the 4-argument constructor, so fake .code/.getcode there.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Add the default browser-like headers, then strip the internal
        # marker header (and Accept-encoding) when compression is unwanted.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # Transparently unwrap gzip- and deflate-encoded response bodies.
        # NOTE(review): the 'old_resp = resp' assignment is missing from
        # this extraction.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    matchtitle: Download only matching titles.
    rejecttitle: Reject downloads for matching titles.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson: Write the video description to a .info.json file
    writesubtitles: Write the video subtitles to a .srt file
    subtitleslang: Language of the subtitles to download
    """

    # Process exit status of the last download (0 = success).
    _download_retcode = None
    # Ordinal used by the %(autonumber)s output-template field.
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): sampled extraction -- the initialization of the
        # extractor/post-processor lists and the 'self.params = params'
        # assignment are missing here, although self.params is read below.
        self._download_retcode = 0
        self._num_downloads = 0
        # Index with the boolean: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): sampled extraction -- the @staticmethod decorators and
# several branch bodies are missing from the helpers below.
def format_bytes(bytes):
    # Human-readable byte count, e.g. 1536 -> '1.50k'.
    if type(bytes) is str:
    exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)

def calc_percent(byte_counter, data_len):
    # Fixed-width percentage string, right-aligned to 6 characters.
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

def calc_eta(start, now, total, current):
    # Estimated time remaining as MM:SS, based on average speed so far.
    # NOTE(review): the 'dif = now - start' assignment is missing here.
    if current == 0 or dif < 0.001: # One millisecond
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    return '%02d:%02d' % (eta_mins, eta_secs)

def calc_speed(start, now, bytes):
    # Average speed string, e.g. '  1.20M/s'; placeholder until data flows.
    # NOTE(review): the 'dif = now - start' assignment is missing here.
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

def best_block_size(elapsed_time, bytes):
    # Adapt the read size: halve/double the previous block, clamped.
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
    rate = bytes / elapsed_time

def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer."""
    # NOTE(review): the 'matchobj is None' guard is missing here.
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    number = float(matchobj.group(1))
    # Empty suffix maps to index 0 ('b'), i.e. multiplier 1024**0 == 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # NOTE(review): the list append is missing from this extraction;
    # only the mutual registration survives.
    ie.set_downloader(self)

def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # NOTE(review): as above, the chain append is missing here.
    pp.set_downloader(self)

def to_screen(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode."""
    assert type(message) == type(u'')
    if not self.params.get('quiet', False):
        terminator = [u'\n', u''][skip_eol]
        output = message + terminator
        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
            output = output.encode(preferredencoding(), 'ignore')
        self._screen_file.write(output)
        self._screen_file.flush()
def to_stderr(self, message):
    """Write *message*, locale-encoded and newline-terminated, to stderr."""
    encoded = message.encode(preferredencoding())
    sys.stderr.write(encoded + '\n')
def to_cons_title(self, message):
    """Set console/terminal window title to message."""
    # NOTE(review): the early 'return' for the disabled case is missing
    # from this extraction.
    if not self.params.get('consoletitle', False):
    if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
    elif 'TERM' in os.environ:
        # xterm-compatible terminals: OSC 0 escape sequence sets the title.
        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Errors are being ignored: record failure in the process exit status.
    self._download_retcode = 1
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    # NOTE(review): the early 'return' and 'now = time.time()' lines are
    # missing from this extraction.
    if rate_limit is None or byte_counter == 0:
    elapsed = now - start_time
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep exactly long enough that the average speed drops back to
        # the configured limit.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

def temp_name(self, filename):
    """Returns a temporary filename for the given filename."""
    # NOTE(review): the 'return filename' for the no-part branch is
    # missing from this extraction.
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
    return filename + u'.part'

def undo_temp_name(self, filename):
    # Strip the .part suffix added by temp_name(); the fallthrough
    # 'return filename' is missing from this extraction.
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]

def try_rename(self, old_filename, new_filename):
    # Move the finished .part file into place; the original try: line is
    # missing from this extraction.
    if old_filename == new_filename:
    os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
    except (IOError, OSError), err:
        self.trouble(u'ERROR: unable to rename file')

def try_utime(self, filename, last_modified_hdr):
    """Try to set the last-modified time of the given file."""
    # Best effort: bail out silently when the header is absent, the file
    # is missing, or the date cannot be parsed (guards partially missing
    # from this extraction).
    if last_modified_hdr is None:
    if not os.path.isfile(_encodeFilename(filename)):
    timestr = last_modified_hdr
    filetime = timeconvert(timestr)
    # Keep atime current, set mtime to the server-provided timestamp.
    os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """ Report that the description file is being written """
    self.to_screen(u'[info] Writing video description to: ' + descfn)

def report_writesubtitles(self, srtfn):
    """ Report that the subtitles file is being written """
    self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

def report_writeinfojson(self, infofn):
    """ Report that the metadata file has been written """
    self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

def report_destination(self, filename):
    """Report destination filename."""
    self.to_screen(u'[download] Destination: ' + filename)

def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    # NOTE(review): the early 'return' for the noprogress case is missing
    # from this extraction.
    if self.params.get('noprogress', False):
    # \r rewrites the current line in place for a live progress display.
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
    self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
            (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    # NOTE(review): the 'try:' line is missing from this extraction; the
    # fallback exists because file_name may not be encodable to the
    # console encoding.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except (UnicodeEncodeError), err:
        self.to_screen(u'[download] The file has already been downloaded')

def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    self.to_screen(u'[download] Unable to resume')

def report_finish(self):
    """Report download finished."""
    # NOTE(review): the else-branch (finishing the progress line) is
    # missing from this extraction.
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')

def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads += 1
def prepare_filename(self, info_dict):
    """Generate the output filename."""
    # NOTE(review): the surrounding try: and the return statements are
    # missing from this extraction.
    template_dict = dict(info_dict)
    # Extra template fields computed at download time.
    template_dict['epoch'] = unicode(long(time.time()))
    template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
    filename = self.params['outtmpl'] % template_dict
    except (ValueError, KeyError), err:
        self.trouble(u'ERROR: invalid system charset or erroneous output template')

def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded """
    # NOTE(review): this first return already embeds the u'[download] '
    # prefix, yet the caller (process_info) prepends u'[download] ' again
    # before printing -- the prefix is doubled for match failures.
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor."""
    # NOTE(review): sampled extraction -- several try:/return/close()
    # lines are missing throughout this method.

    # Title match/reject filters decide first.
    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)

    # --max-downloads cap.
    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printing to stdout (for scripting front-ends).
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):

    # Create the target directory if needed (dn is a byte path already).
    dn = os.path.dirname(_encodeFilename(filename))
    if dn != '' and not os.path.exists(dn): # dn is already encoded
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))

    if self.params.get('writedescription', False):
        descfn = filename + u'.description'
        self.report_writedescription(descfn)
        descfile = open(_encodeFilename(descfn), 'wb')
        descfile.write(info_dict['description'].encode('utf-8'))
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write description file ' + descfn)

    if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        srtfn = filename.rsplit('.', 1)[0] + u'.srt'
        self.report_writesubtitles(srtfn)
        srtfile = open(_encodeFilename(srtfn), 'wb')
        srtfile.write(info_dict['subtitles'].encode('utf-8'))
        except (OSError, IOError):
            # NOTE(review): bug -- this error message references descfn
            # (from the description branch) instead of srtfn; it raises
            # NameError when --write-description was not also given.
            self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        except (NameError,AttributeError):
            self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
        infof = open(_encodeFilename(infofn), 'wb')
        # urlhandle is a live socket-ish object and is not serializable.
        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
        json.dump(json_info_dict, infof)
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
            success = self._do_download(filename, info_dict)
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
def download(self, url_list):
    """Download a given list of URLs."""
    # Refuse multiple URLs when the template has no varying fields --
    # every download would overwrite the same file.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    # NOTE(review): the two loop headers ('for url in url_list:' and the
    # inner loop over registered InfoExtractors) are missing from this
    # extraction.
    suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it

        # Suitable InfoExtractor had been found; go to next URL
    if not suitable_found:
        self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode
def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # NOTE(review): the copy of ie_info into `info` and the loop over the
    # registered post-processors are missing from this extraction.
    info['filepath'] = filename
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an rtmp:// stream by shelling out to rtmpdump."""
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrumpted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
    if self.params.get('verbose', False):
        # NOTE(review): the try/except importing pipes is missing; this
        # lambda is the fallback shell-quoter for the debug line below.
        shell_quote = lambda args: ' '.join(map(pipes.quote, args))
        self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
    retval = subprocess.call(args)
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(_encodeFilename(tmpfilename))
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        # Resume (-e); keep -k 1 only for the retval==1 case.
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(_encodeFilename(tmpfilename))
        if prevsize == cursize and retval == 1:
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
    # NOTE(review): the 'if retval == 0:' success guard is missing here.
    self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
    self.try_rename(tmpfilename, filename)
    self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
def _do_download(self, filename, info_dict):
    """Download info_dict['url'] to *filename* over HTTP (or rtmpdump),
    with resume, retry and rate-limit support. Returns True on success."""
    # NOTE(review): sampled extraction -- try:/return/else lines and the
    # open_mode / count / start initializations are missing throughout.
    url = info_dict['url']
    player_url = info_dict.get('player_url', None)

    # Check file already present
    if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
        self.report_file_already_downloaded(filename)

    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url, player_url)

    tmpfilename = self.temp_name(filename)

    # Do not include the Accept-Encoding header
    headers = {'Youtubedl-no-compression': 'True'}
    basic_request = urllib2.Request(url, None, headers)
    request = urllib2.Request(url, None, headers)

    # Establish possible resume length
    if os.path.isfile(_encodeFilename(tmpfilename)):
        resume_len = os.path.getsize(_encodeFilename(tmpfilename))

    if self.params.get('continuedl', False):
        self.report_resuming_byte(resume_len)
        request.add_header('Range','bytes=%d-' % resume_len)

    retries = self.params.get('retries', 0)
    while count <= retries:
        # Establish connection
        if count == 0 and 'urlhandle' in info_dict:
            # Reuse the handle an InfoExtractor already opened.
            data = info_dict['urlhandle']
        data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                # Open the connection again without the range header
                data = urllib2.urlopen(basic_request)
                content_length = data.info()['Content-Length']
                except (urllib2.HTTPError, ), err:
                    if err.code < 500 or err.code >= 600:
                # Examine the reported length
                if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                    # The file had already been fully downloaded.
                    # Explanation to the above condition: in issue #175 it was revealed that
                    # YouTube sometimes adds or removes a few bytes from the end of the file,
                    # changing the file size slightly and causing problems for some users. So
                    # I decided to implement a suggested change and consider the file
                    # completely downloaded if the file size differs less than 100 bytes from
                    # the one in the hard drive.
                    self.report_file_already_downloaded(filename)
                    self.try_rename(tmpfilename, filename)
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
        # Retry with back-off bookkeeping (count increment missing here).
        if count <= retries:
            self.report_retry(count, retries)
    self.trouble(u'ERROR: giving up after %s retries' % retries)

    data_len = data.info().get('Content-length', None)
    if data_len is not None:
        # Content-Length covers only the remaining range when resuming.
        data_len = long(data_len) + resume_len
    data_len_str = self.format_bytes(data_len)
    byte_counter = 0 + resume_len

    # Main read/write loop (the 'while True:' header is missing here).
        # Download and write
        before = time.time()
        data_block = data.read(block_size)
        if len(data_block) == 0:
        byte_counter += len(data_block)

        # Open file just in time
            (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
            assert stream is not None
            filename = self.undo_temp_name(tmpfilename)
            self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the observed throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress display: byte counts are relative to the resume point.
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        if data_len is None:
            self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

    self.trouble(u'\nERROR: Did not get any data blocks')
    self.report_finish()
    # A short read relative to the announced length is an error.
    if data_len is not None and byte_counter != data_len:
        raise ContentTooShortError(byte_counter, long(data_len))
    self.try_rename(tmpfilename, filename)

    # Update file modification time
    if self.params.get('updatetime', True):
        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the '_ready = False' initialization is missing
        # from this extraction (initialize() relies on it upstream).
        self.set_downloader(downloader)
1144 def suitable(self, url):
1145 """Receives a URL and returns True if suitable for this IE."""
1146 return re.match(self._VALID_URL, url) is not None
1148 def initialize(self):
1149 """Initializes an instance (authentication, etc)."""
1151 self._real_initialize()
1154 def extract(self, url):
1155 """Extracts URL information and returns it in list of dicts."""
1157 return self._real_extract(url)
1159 def set_downloader(self, downloader):
1160 """Sets the downloader for this IE."""
1161 self._downloader = downloader
1163 def _real_initialize(self):
1164 """Real initialization process. Redefine in subclasses."""
1167 def _real_extract(self, url):
1168 """Real extraction process. Redefine in subclasses."""
# NOTE(review): line-numbered dump with sampling gaps — the leading integers
# are original file line numbers; missing lines include try/except openers,
# `if mobj is None:` guards and `return` statements. Code text is preserved
# byte-for-byte; only comments are added/fixed.
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (interior entries lost in the sampling gap).
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map (entries lost in the sampling gap).
1194 _video_dimensions = {
1209 IE_NAME = u'youtube'
1211 def report_lang(self):
1212 """Report attempt to set language."""
1213 self._downloader.to_screen(u'[youtube] Setting language')
1215 def report_login(self):
1216 """Report attempt to log in."""
1217 self._downloader.to_screen(u'[youtube] Logging in')
1219 def report_age_confirmation(self):
1220 """Report attempt to confirm age."""
1221 self._downloader.to_screen(u'[youtube] Confirming age')
1223 def report_video_webpage_download(self, video_id):
1224 """Report attempt to download video webpage."""
1225 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227 def report_video_info_webpage_download(self, video_id):
1228 """Report attempt to download video info webpage."""
1229 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231 def report_video_subtitles_download(self, video_id):
1232 """Report attempt to download video subtitles."""
1233 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235 def report_information_extraction(self, video_id):
1236 """Report attempt to extract video information."""
1237 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239 def report_unavailable_format(self, video_id, format):
1240 """Report that the requested format is unavailable."""
1241 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243 def report_rtmp_download(self):
1244 """Indicate the download will use the RTMP protocol."""
1245 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Convert YouTube's timed-text XML into SubRip (.srt) text.
# NOTE(review): the `srt` accumulator's initialization and the final
# `return srt` fall in sampling gaps (1248, 1262) — confirm against full
# source.
1247 def _closed_captions_xml_to_srt(self, xml_string):
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
# Format seconds as SRT timestamps: HH:MM:SS,mmm
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional: unescapes doubly-encoded entities like &amp;quot;
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
# Print the format table for --list-formats (loop header over `formats`
# lost in the 1266 gap).
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Set language, then log in and confirm age if credentials are available.
1269 def _real_initialize(self):
1270 if self._downloader is None:
1275 downloader_params = self._downloader.params
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289 except (IOError, netrc.NetrcParseError), err:
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English interface so later regexes match.
1294 request = urllib2.Request(self._LANG_URL)
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1302 # No authentication to be performed
1303 if username is None:
1308 'current_form': 'loginForm',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1317 login_results = urllib2.urlopen(request).read()
# The login form reappearing in the response means the login failed.
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1328 'action_confirm': 'Confirm',
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1338 def _real_extract(self, url):
1339 # Extract video id from URL
1340 mobj = re.match(self._VALID_URL, url)
1342 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1344 video_id = mobj.group(2)
1347 self.report_video_webpage_download(video_id)
1348 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1350 video_webpage = urllib2.urlopen(request).read()
1351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1355 # Attempt to extract SWF player URL
1356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357 if mobj is not None:
# Unescape the JS-escaped URL (\/ -> /).
1358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` values until get_video_info returns a token.
1363 self.report_video_info_webpage_download(video_id)
1364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366 % (video_id, el_type))
1367 request = urllib2.Request(video_info_url)
1369 video_info_webpage = urllib2.urlopen(request).read()
1370 video_info = parse_qs(video_info_webpage)
1371 if 'token' in video_info:
1373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1376 if 'token' not in video_info:
1377 if 'reason' in video_info:
1378 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1380 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1383 # Start extracting information
1384 self.report_information_extraction(video_id)
1387 if 'author' not in video_info:
1388 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1390 video_uploader = urllib.unquote_plus(video_info['author'][0])
1393 if 'title' not in video_info:
1394 self._downloader.trouble(u'ERROR: unable to extract video title')
1396 video_title = urllib.unquote_plus(video_info['title'][0])
1397 video_title = video_title.decode('utf-8')
1398 video_title = sanitize_title(video_title)
1401 simple_title = _simplify_title(video_title)
1404 if 'thumbnail_url' not in video_info:
1405 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406 video_thumbnail = ''
1407 else: # don't panic if we can't find it
1408 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD.
1412 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413 if mobj is not None:
1414 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416 for expression in format_expressions:
1418 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1426 video_description = u'No description available.'
1427 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428 if mobj is not None:
1429 video_description = mobj.group(1).decode('utf-8')
# Full-description path via lxml (guarded by an option on missing lines).
1431 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434 # TODO use another parser
# Closed captions (only when --write-srt was given).
1437 video_subtitles = None
1438 if self._downloader.params.get('writesubtitles', False):
1439 self.report_video_subtitles_download(video_id)
1440 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1442 srt_list = urllib2.urlopen(request).read()
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1446 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
# Language preference: explicit option > English > first available.
1448 if self._downloader.params.get('subtitleslang', False):
1449 srt_lang = self._downloader.params.get('subtitleslang')
1450 elif 'en' in srt_lang_list:
1453 srt_lang = srt_lang_list[0]
1454 if not srt_lang in srt_lang_list:
1455 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1457 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1459 srt_xml = urllib2.urlopen(request).read()
1460 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1463 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1465 self._downloader.trouble(u'WARNING: video has no closed captions')
1468 video_token = urllib.unquote_plus(video_info['token'][0])
1470 # Decide which formats to download
1471 req_format = self._downloader.params.get('format', None)
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 video_url_list = [(None, video_info['conn'][0])]
1476 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478 url_data = [parse_qs(uds) for uds in url_data_strs]
1479 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1482 format_limit = self._downloader.params.get('format_limit', None)
1483 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484 if format_limit is not None and format_limit in available_formats:
1485 format_list = available_formats[available_formats.index(format_limit):]
1487 format_list = available_formats
1488 existing_formats = [x for x in format_list if x in url_map]
1489 if len(existing_formats) == 0:
1490 self._downloader.trouble(u'ERROR: no known formats available for video')
1492 if self._downloader.params.get('listformats', None):
1493 self._print_formats(existing_formats)
1495 if req_format is None or req_format == 'best':
1496 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497 elif req_format == 'worst':
1498 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499 elif req_format in ('-1', 'all'):
1500 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1502 # Specific formats. We pick the first in a slash-delimited sequence.
1503 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504 req_formats = req_format.split('/')
1505 video_url_list = None
1506 for rf in req_formats:
1508 video_url_list = [(rf, url_map[rf])]
1510 if video_url_list is None:
1511 self._downloader.trouble(u'ERROR: requested format not available')
1514 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1517 for format_param, video_real_url in video_url_list:
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1522 video_extension = self._video_extensions.get(format_param, 'flv')
1525 # Process video information
1526 self._downloader.process_info({
1527 'id': video_id.decode('utf-8'),
1528 'url': video_real_url.decode('utf-8'),
1529 'uploader': video_uploader.decode('utf-8'),
1530 'upload_date': upload_date,
1531 'title': video_title,
1532 'stitle': simple_title,
1533 'ext': video_extension.decode('utf-8'),
1534 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1535 'thumbnail': video_thumbnail.decode('utf-8'),
1536 'description': video_description,
1537 'player_url': player_url,
1538 'subtitles': video_subtitles
1540 except UnavailableVideoError, err:
1541 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): line-numbered dump with sampling gaps — leading integers are
# original file line numbers; try/except openers, `if mobj is None:` guards
# and `return` statements fall in the gaps. Code preserved byte-for-byte;
# only comments added.
1544 class MetacafeIE(InfoExtractor):
1545 """Information Extractor for metacafe.com."""
1547 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1548 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1549 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1551 IE_NAME = u'metacafe'
# Keeps a YoutubeIE around because metacafe hosts "yt-..." ids that are
# really YouTube videos (see _real_extract below).
1553 def __init__(self, youtube_ie, downloader=None):
1554 InfoExtractor.__init__(self, downloader)
1555 self._youtube_ie = youtube_ie
1557 def report_disclaimer(self):
1558 """Report disclaimer retrieval."""
1559 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1561 def report_age_confirmation(self):
1562 """Report attempt to confirm age."""
1563 self._downloader.to_screen(u'[metacafe] Confirming age')
1565 def report_download_webpage(self, video_id):
1566 """Report webpage download."""
1567 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1569 def report_extraction(self, video_id):
1570 """Report information extraction."""
1571 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page and POST the family-filter form so adult
# content is reachable.
1573 def _real_initialize(self):
1574 # Retrieve disclaimer
1575 request = urllib2.Request(self._DISCLAIMER)
1577 self.report_disclaimer()
1578 disclaimer = urllib2.urlopen(request).read()
1579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1580 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1586 'submit': "Continue - I'm over 18",
1588 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1590 self.report_age_confirmation()
1591 disclaimer = urllib2.urlopen(request).read()
1592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1593 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1596 def _real_extract(self, url):
1597 # Extract id and simplified title from URL
1598 mobj = re.match(self._VALID_URL, url)
1600 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1603 video_id = mobj.group(1)
1605 # Check if video comes from YouTube
1606 mobj2 = re.match(r'^yt-(.*)$', video_id)
1607 if mobj2 is not None:
# Delegate to the YouTube extractor for yt-prefixed ids.
1608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1611 # At this point we have a new video
1612 self._downloader.increment_downloads()
1614 simple_title = mobj.group(2).decode('utf-8')
1616 # Retrieve video webpage to extract further information
1617 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1619 self.report_download_webpage(video_id)
1620 webpage = urllib2.urlopen(request).read()
1621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1622 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1625 # Extract URL, uploader and title from webpage
1626 self.report_extraction(video_id)
1627 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1628 if mobj is not None:
1629 mediaURL = urllib.unquote(mobj.group(1))
1630 video_extension = mediaURL[-3:]
1632 # Extract gdaKey if available
1633 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1635 video_url = mediaURL
1637 gdaKey = mobj.group(1)
1638 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no &mediaURL= — pull it out of the flashvars blob instead.
1640 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract media URL')
1644 vardict = parse_qs(mobj.group(1))
1645 if 'mediaData' not in vardict:
1646 self._downloader.trouble(u'ERROR: unable to extract media URL')
1648 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1650 self._downloader.trouble(u'ERROR: unable to extract media URL')
1652 mediaURL = mobj.group(1).replace('\\/', '/')
1653 video_extension = mediaURL[-3:]
1654 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1656 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1658 self._downloader.trouble(u'ERROR: unable to extract title')
1660 video_title = mobj.group(1).decode('utf-8')
1661 video_title = sanitize_title(video_title)
1663 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1665 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1667 video_uploader = mobj.group(1)
1670 # Process video information
1671 self._downloader.process_info({
1672 'id': video_id.decode('utf-8'),
1673 'url': video_url.decode('utf-8'),
1674 'uploader': video_uploader.decode('utf-8'),
1675 'upload_date': u'NA',
1676 'title': video_title,
1677 'stitle': simple_title,
1678 'ext': video_extension.decode('utf-8'),
1682 except UnavailableVideoError:
1683 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): line-numbered dump with sampling gaps — leading integers are
# original file line numbers; guards/returns/try-openers are missing. Code
# preserved byte-for-byte; only comments added.
1686 class DailymotionIE(InfoExtractor):
1687 """Information Extractor for Dailymotion"""
1689 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1690 IE_NAME = u'dailymotion'
1692 def __init__(self, downloader=None):
1693 InfoExtractor.__init__(self, downloader)
1695 def report_download_webpage(self, video_id):
1696 """Report webpage download."""
1697 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1699 def report_extraction(self, video_id):
1700 """Report information extraction."""
1701 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1703 def _real_extract(self, url):
1704 # Extract id and simplified title from URL
1705 mobj = re.match(self._VALID_URL, url)
1707 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1710 # At this point we have a new video
1711 self._downloader.increment_downloads()
1712 video_id = mobj.group(1)
1714 video_extension = 'flv'
1716 # Retrieve video webpage to extract further information
1717 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
1718 request.add_header('Cookie', 'family_filter=off')
1720 self.report_download_webpage(video_id)
1721 webpage = urllib2.urlopen(request).read()
1722 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1723 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1726 # Extract URL, uploader and title from webpage
1727 self.report_extraction(video_id)
1728 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1730 self._downloader.trouble(u'ERROR: unable to extract media URL')
1732 sequence = urllib.unquote(mobj.group(1))
# The SD stream URL lives inside the url-encoded "sequence" flashvar.
1733 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1735 self._downloader.trouble(u'ERROR: unable to extract media URL')
1737 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1739 # if needed add http://www.dailymotion.com/ if relative URL
1741 video_url = mediaURL
1743 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract title')
1747 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1748 video_title = sanitize_title(video_title)
1749 simple_title = _simplify_title(video_title)
1751 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1753 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1755 video_uploader = mobj.group(1)
1758 # Process video information
1759 self._downloader.process_info({
1760 'id': video_id.decode('utf-8'),
1761 'url': video_url.decode('utf-8'),
1762 'uploader': video_uploader.decode('utf-8'),
1763 'upload_date': u'NA',
1764 'title': video_title,
1765 'stitle': simple_title,
1766 'ext': video_extension.decode('utf-8'),
1770 except UnavailableVideoError:
1771 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): line-numbered dump with sampling gaps — leading integers are
# original file line numbers; guards/returns/try-openers are missing. Code
# preserved byte-for-byte; only comments added.
1774 class GoogleIE(InfoExtractor):
1775 """Information extractor for video.google.com."""
1777 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778 IE_NAME = u'video.google'
1780 def __init__(self, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1783 def report_download_webpage(self, video_id):
1784 """Report webpage download."""
1785 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1787 def report_extraction(self, video_id):
1788 """Report information extraction."""
1789 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1791 def _real_extract(self, url):
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1802 video_extension = 'mp4'
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl.
1815 mobj = re.search(r"download_url:'([^']+)'", webpage)
1817 video_extension = 'flv'
1818 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1820 self._downloader.trouble(u'ERROR: unable to extract media URL')
1822 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS hex escapes (\x3d is '=', \x26 is '&').
1823 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824 mediaURL = mediaURL.replace('\\x26', '\x26')
1826 video_url = mediaURL
1828 mobj = re.search(r'<title>(.*)</title>', webpage)
1830 self._downloader.trouble(u'ERROR: unable to extract title')
1832 video_title = mobj.group(1).decode('utf-8')
1833 video_title = sanitize_title(video_title)
1834 simple_title = _simplify_title(video_title)
1836 # Extract video description
1837 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract video description')
1841 video_description = mobj.group(1).decode('utf-8')
1842 if not video_description:
1843 video_description = 'No description available.'
1845 # Extract video thumbnail
1846 if self._downloader.params.get('forcethumbnail', False):
# Thumbnails only appear on the search page, so run a search for the id.
1847 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1853 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1857 video_thumbnail = mobj.group(1)
1858 else: # we need something to pass to process_info
1859 video_thumbnail = ''
1862 # Process video information
1863 self._downloader.process_info({
1864 'id': video_id.decode('utf-8'),
1865 'url': video_url.decode('utf-8'),
1867 'upload_date': u'NA',
1868 'title': video_title,
1869 'stitle': simple_title,
1870 'ext': video_extension.decode('utf-8'),
1874 except UnavailableVideoError:
1875 self._downloader.trouble(u'\nERROR: unable to download video')
1878 class PhotobucketIE(InfoExtractor):
1879 """Information extractor for photobucket.com."""
1881 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882 IE_NAME = u'photobucket'
1884 def __init__(self, downloader=None):
1885 InfoExtractor.__init__(self, downloader)
1887 def report_download_webpage(self, video_id):
1888 """Report webpage download."""
1889 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1891 def report_extraction(self, video_id):
1892 """Report information extraction."""
1893 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1895 def _real_extract(self, url):
1896 # Extract id from URL
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1902 # At this point we have a new video
1903 self._downloader.increment_downloads()
1904 video_id = mobj.group(1)
1906 video_extension = 'flv'
1908 # Retrieve video webpage to extract further information
1909 request = urllib2.Request(url)
1911 self.report_download_webpage(video_id)
1912 webpage = urllib2.urlopen(request).read()
1913 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1917 # Extract URL, uploader, and title from webpage
1918 self.report_extraction(video_id)
1919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1921 self._downloader.trouble(u'ERROR: unable to extract media URL')
1923 mediaURL = urllib.unquote(mobj.group(1))
1925 video_url = mediaURL
1927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1929 self._downloader.trouble(u'ERROR: unable to extract title')
1931 video_title = mobj.group(1).decode('utf-8')
1932 video_title = sanitize_title(video_title)
1933 simple_title = _simplify_title(vide_title)
1935 video_uploader = mobj.group(2).decode('utf-8')
1938 # Process video information
1939 self._downloader.process_info({
1940 'id': video_id.decode('utf-8'),
1941 'url': video_url.decode('utf-8'),
1942 'uploader': video_uploader,
1943 'upload_date': u'NA',
1944 'title': video_title,
1945 'stitle': simple_title,
1946 'ext': video_extension.decode('utf-8'),
1950 except UnavailableVideoError:
1951 self._downloader.trouble(u'\nERROR: unable to download video')
1954 class YahooIE(InfoExtractor):
1955 """Information extractor for video.yahoo.com."""
1957 # _VALID_URL matches all Yahoo! Video URLs
1958 # _VPAGE_URL matches only the extractable '/watch/' URLs
1959 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961 IE_NAME = u'video.yahoo'
1963 def __init__(self, downloader=None):
1964 InfoExtractor.__init__(self, downloader)
1966 def report_download_webpage(self, video_id):
1967 """Report webpage download."""
1968 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1970 def report_extraction(self, video_id):
1971 """Report information extraction."""
1972 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(2)
1984 video_extension = 'flv'
1986 # Rewrite valid but non-extractable URLs as
1987 # extractable English language /watch/ URLs
1988 if re.match(self._VPAGE_URL, url) is None:
1989 request = urllib2.Request(url)
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1996 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1998 self._downloader.trouble(u'ERROR: Unable to extract id field')
2000 yahoo_id = mobj.group(1)
2002 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2004 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2006 yahoo_vid = mobj.group(1)
2008 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009 return self._real_extract(url, new_video=False)
2011 # Retrieve video webpage to extract further information
2012 request = urllib2.Request(url)
2014 self.report_download_webpage(video_id)
2015 webpage = urllib2.urlopen(request).read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2020 # Extract uploader and title from webpage
2021 self.report_extraction(video_id)
2022 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2024 self._downloader.trouble(u'ERROR: unable to extract video title')
2026 video_title = mobj.group(1).decode('utf-8')
2027 simple_title = _simplify_title(video_title)
2029 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2031 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2033 video_uploader = mobj.group(1).decode('utf-8')
2035 # Extract video thumbnail
2036 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2038 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2040 video_thumbnail = mobj.group(1).decode('utf-8')
2042 # Extract video description
2043 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2045 self._downloader.trouble(u'ERROR: unable to extract video description')
2047 video_description = mobj.group(1).decode('utf-8')
2048 if not video_description:
2049 video_description = 'No description available.'
2051 # Extract video height and width
2052 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract video height')
2056 yv_video_height = mobj.group(1)
2058 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video width')
2062 yv_video_width = mobj.group(1)
2064 # Retrieve video playlist to extract media URL
2065 # I'm not completely sure what all these options are, but we
2066 # seem to need most of them, otherwise the server sends a 401.
2067 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2068 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2069 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2079 # Extract media URL from playlist XML
2080 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2082 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2084 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2085 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2088 # Process video information
2089 self._downloader.process_info({
2090 'id': video_id.decode('utf-8'),
2092 'uploader': video_uploader,
2093 'upload_date': u'NA',
2094 'title': video_title,
2095 'stitle': simple_title,
2096 'ext': video_extension.decode('utf-8'),
2097 'thumbnail': video_thumbnail.decode('utf-8'),
2098 'description': video_description,
2099 'thumbnail': video_thumbnail,
2102 except UnavailableVideoError:
2103 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2106 class VimeoIE(InfoExtractor):
2107 """Information extractor for vimeo.com."""
2109 # _VALID_URL matches Vimeo URLs
# NOTE(review): the '.' after (?:www|player) is unescaped, so it matches any
# single character rather than a literal dot — presumably a bug; confirm.
2110 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2113 def __init__(self, downloader=None):
2114 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, matching the style of the other extractors.
2116 def report_download_webpage(self, video_id):
2117 """Report webpage download."""
2118 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2120 def report_extraction(self, video_id):
2121 """Report information extraction."""
2122 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2124 def _real_extract(self, url, new_video=True):
2125 # Extract ID from URL
2126 mobj = re.match(self._VALID_URL, url)
2128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2131 # At this point we have a new video
2132 self._downloader.increment_downloads()
2133 video_id = mobj.group(1)
2135 # Retrieve video webpage to extract further information
2136 request = urllib2.Request(url, None, std_headers)
2138 self.report_download_webpage(video_id)
2139 webpage = urllib2.urlopen(request).read()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2144 # Now we begin extracting as much information as we can from what we
2145 # retrieved. First we extract the information common to all extractors,
2146 # and latter we extract those that are Vimeo specific.
2147 self.report_extraction(video_id)
2149 # Extract the config JSON
# NOTE(review): splitting on the literal ' = {config:' marker is brittle —
# any markup change on vimeo.com breaks it; a tolerant regex would be safer.
2150 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2152 config = json.loads(config)
2154 self._downloader.trouble(u'ERROR: unable to extract info section')
# Title, uploader and thumbnail come straight out of the config JSON.
2158 video_title = config["video"]["title"]
2159 simple_title = _simplify_title(video_title)
2162 video_uploader = config["video"]["owner"]["name"]
2164 # Extract video thumbnail
2165 video_thumbnail = config["video"]["thumbnail"]
2167 # Extract video description
2171 video_description = u'No description available.'
2172 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173 if mobj is not None:
2174 video_description = mobj.group(1)
# Fallback: pull the #description element's text via lxml when the meta tag
# is absent (lxml is a third-party dependency — see TODO below).
2176 html_parser = lxml.etree.HTMLParser()
2177 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179 # TODO use another parser
2181 # Extract upload date
2182 video_upload_date = u'NA'
2183 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184 if mobj is not None:
2185 video_upload_date = mobj.group(1)
2187 # Vimeo specific: extract request signature and timestamp
# sig/timestamp pair authorizes the play_redirect request built below.
2188 sig = config['request']['signature']
2189 timestamp = config['request']['timestamp']
2191 # Vimeo specific: extract video codec and quality information
2192 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8/flv, then vp6/flv.
2193 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194 for codec in codecs:
2195 if codec[0] in config["video"]["files"]:
2196 video_codec = codec[0]
2197 video_extension = codec[1]
2198 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199 else: quality = 'sd'
2202 self._downloader.trouble(u'ERROR: no known codec found')
2205 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206 %(video_id, sig, timestamp, quality, video_codec.upper())
2209 # Process video information
2210 self._downloader.process_info({
2213 'uploader': video_uploader,
2214 'upload_date': video_upload_date,
2215 'title': video_title,
2216 'stitle': simple_title,
2217 'ext': video_extension,
2218 'thumbnail': video_thumbnail,
2219 'description': video_description,
2222 except UnavailableVideoError:
2223 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2226 class GenericIE(InfoExtractor):
2227 """Generic last-resort information extractor."""
2230 IE_NAME = u'generic'
2232 def __init__(self, downloader=None):
2233 InfoExtractor.__init__(self, downloader)
2235 def report_download_webpage(self, video_id):
2236 """Report webpage download."""
# Warn loudly: this extractor is a best-effort fallback only.
2237 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2238 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2240 def report_extraction(self, video_id):
2241 """Report information extraction."""
2242 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2244 def _real_extract(self, url):
2245 # At this point we have a new video
2246 self._downloader.increment_downloads()
# Provisional id: last path component; replaced once the media URL is known.
2248 video_id = url.split('/')[-1]
2249 request = urllib2.Request(url)
2251 self.report_download_webpage(video_id)
2252 webpage = urllib2.urlopen(request).read()
2253 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2256 except ValueError, err:
2257 # since this is the last-resort InfoExtractor, if
2258 # this error is thrown, it'll be thrown here
2259 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2262 self.report_extraction(video_id)
2263 # Start with something easy: JW Player in SWFObject
2264 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2266 # Broaden the search a little bit
2267 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2272 # It's possible that one of the regexes
2273 # matched, but returned an empty group:
2274 if mobj.group(1) is None:
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2278 video_url = urllib.unquote(mobj.group(1))
2279 video_id = os.path.basename(video_url)
2281 # here's a fun little line of code for you:
2282 video_extension = os.path.splitext(video_id)[1][1:]
2283 video_id = os.path.splitext(video_id)[0]
2285 # it's tempting to parse this further, but you would
2286 # have to take into account all the variations like
2287 # Video Title - Site Name
2288 # Site Name | Video Title
2289 # Video Title - Tagline | Site Name
2290 # and so on and so forth; it's just not practical
2291 mobj = re.search(r'<title>(.*)</title>', webpage)
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2295 video_title = mobj.group(1).decode('utf-8')
2296 video_title = sanitize_title(video_title)
2297 simple_title = _simplify_title(video_title)
2299 # video uploader is domain name
2300 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error text says 'title' but this step extracts the uploader
# (domain name) — message looks copy-pasted; should say 'uploader'.
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2304 video_uploader = mobj.group(1).decode('utf-8')
2307 # Process video information
2308 self._downloader.process_info({
2309 'id': video_id.decode('utf-8'),
2310 'url': video_url.decode('utf-8'),
2311 'uploader': video_uploader,
2312 'upload_date': u'NA',
2313 'title': video_title,
2314 'stitle': simple_title,
2315 'ext': video_extension.decode('utf-8'),
2319 except UnavailableVideoError, err:
2320 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2323 class YoutubeSearchIE(InfoExtractor):
2324 """Information Extractor for YouTube search queries."""
# Accepts 'ytsearch:Q' (first hit), 'ytsearchN:Q' or 'ytsearchall:Q'.
2325 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2326 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2328 _max_youtube_results = 1000
2329 IE_NAME = u'youtube:search'
2331 def __init__(self, youtube_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
# Individual videos are delegated to the YoutubeIE passed in here.
2333 self._youtube_ie = youtube_ie
2335 def report_download_page(self, query, pagenum):
"""Report attempt to download a search results page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2340 def _real_initialize(self):
2341 self._youtube_ie.initialize()
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2349 prefix, query = query.split(':')
2351 query = query.encode('utf-8')
2353 self._download_n_results(query, 1)
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_youtube_results)
# Numeric prefix: validate the requested count and clamp to the API cap.
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2364 elif n > self._max_youtube_results:
2365 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2366 n = self._max_youtube_results
2367 self._download_n_results(query, n)
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` is reached.
2380 while (50 * pagenum) < limit:
2381 self.report_download_page(query, pagenum+1)
2382 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2383 request = urllib2.Request(result_url)
2385 data = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2389 api_response = json.loads(data)['data']
2391 new_ids = list(video['id'] for video in api_response['items'])
2392 video_ids += new_ids
# Never request more than the API reports as actually available.
2394 limit = min(n, api_response['totalItems'])
2397 if len(video_ids) > n:
2398 video_ids = video_ids[:n]
2399 for id in video_ids:
2400 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2404 class GoogleSearchIE(InfoExtractor):
2405 """Information Extractor for Google Video search queries."""
2406 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2407 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2408 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of a "next" pagination link on the scraped results page.
2409 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2411 _max_google_results = 1000
2412 IE_NAME = u'video.google:search'
2414 def __init__(self, google_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._google_ie = google_ie
2418 def report_download_page(self, query, pagenum):
"""Report attempt to download a search results page with given number."""
2420 query = query.decode(preferredencoding())
2421 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2423 def _real_initialize(self):
2424 self._google_ie.initialize()
2426 def _real_extract(self, query):
2427 mobj = re.match(self._VALID_URL, query)
2429 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2432 prefix, query = query.split(':')
2434 query = query.encode('utf-8')
2436 self._download_n_results(query, 1)
2438 elif prefix == 'all':
2439 self._download_n_results(query, self._max_google_results)
# Numeric prefix: validate the requested count and clamp to the cap.
2445 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2447 elif n > self._max_google_results:
2448 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2449 n = self._max_google_results
2450 self._download_n_results(query, n)
2452 except ValueError: # parsing prefix as integer fails
2453 self._download_n_results(query, 1)
2456 def _download_n_results(self, query, n):
2457 """Downloads a specified number of results for a query"""
# Scrape result pages (10 hits per page) and collect unique docids.
2463 self.report_download_page(query, pagenum)
2464 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2465 request = urllib2.Request(result_url)
2467 page = urllib2.urlopen(request).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2472 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 video_id = mobj.group(1)
2475 if video_id not in video_ids:
2476 video_ids.append(video_id)
2477 if len(video_ids) == n:
2478 # Specified n videos reached
2479 for id in video_ids:
2480 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "next" link means the results were exhausted before reaching n.
2483 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2488 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2491 class YahooSearchIE(InfoExtractor):
2492 """Information Extractor for Yahoo! Video search queries."""
2493 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2494 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2495 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2496 _MORE_PAGES_INDICATOR = r'\s*Next'
2498 _max_yahoo_results = 1000
2499 IE_NAME = u'video.yahoo:search'
2501 def __init__(self, yahoo_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._yahoo_ie = yahoo_ie
2505 def report_download_page(self, query, pagenum):
"""Report attempt to download a search results page with given number."""
2507 query = query.decode(preferredencoding())
2508 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2510 def _real_initialize(self):
2511 self._yahoo_ie.initialize()
2513 def _real_extract(self, query):
2514 mobj = re.match(self._VALID_URL, query)
2516 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2519 prefix, query = query.split(':')
2521 query = query.encode('utf-8')
2523 self._download_n_results(query, 1)
2525 elif prefix == 'all':
2526 self._download_n_results(query, self._max_yahoo_results)
# Numeric prefix: validate the requested count and clamp to the cap.
2532 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2534 elif n > self._max_yahoo_results:
2535 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2536 n = self._max_yahoo_results
2537 self._download_n_results(query, n)
2539 except ValueError: # parsing prefix as integer fails
2540 self._download_n_results(query, 1)
2543 def _download_n_results(self, query, n):
2544 """Downloads a specified number of results for a query"""
# already_seen dedupes ids that appear on more than one result page.
2547 already_seen = set()
2551 self.report_download_page(query, pagenum)
2552 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2553 request = urllib2.Request(result_url)
2555 page = urllib2.urlopen(request).read()
2556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2557 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2560 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 video_id = mobj.group(1)
2563 if video_id not in already_seen:
2564 video_ids.append(video_id)
2565 already_seen.add(video_id)
2566 if len(video_ids) == n:
2567 # Specified n videos reached
2568 for id in video_ids:
2569 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link means the results were exhausted before reaching n.
2572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2577 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2580 class YoutubePlaylistIE(InfoExtractor):
2581 """Information Extractor for YouTube playlists."""
# Matches playlist/course/artist/user-list URLs; group(1) is the query-arg
# kind ('p', 'a' or 'list'), group(2) the list id (optional 'PL' stripped),
# group(3) an optional individual video id inside the list URL.
2583 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2584 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2585 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2586 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2588 IE_NAME = u'youtube:playlist'
2590 def __init__(self, youtube_ie, downloader=None):
2591 InfoExtractor.__init__(self, downloader)
2592 self._youtube_ie = youtube_ie
2594 def report_download_page(self, playlist_id, pagenum):
2595 """Report attempt to download playlist page with given number."""
2596 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2598 def _real_initialize(self):
2599 self._youtube_ie.initialize()
2601 def _real_extract(self, url):
2602 # Extract playlist id
2603 mobj = re.match(self._VALID_URL, url)
2605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video-in-playlist URL: delegate straight to the YoutubeIE.
2609 if mobj.group(3) is not None:
2610 self._youtube_ie.extract(mobj.group(3))
2613 # Download playlist pages
2614 # prefix is 'p' as default for playlists but there are other types that need extra care
2615 playlist_prefix = mobj.group(1)
2616 if playlist_prefix == 'a':
2617 playlist_access = 'artist'
2619 playlist_prefix = 'p'
2620 playlist_access = 'view_play_list'
2621 playlist_id = mobj.group(2)
2626 self.report_download_page(playlist_id, pagenum)
2627 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2628 request = urllib2.Request(url)
2630 page = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2635 # Extract video identifiers
2637 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2638 if mobj.group(1) not in ids_in_page:
2639 ids_in_page.append(mobj.group(1))
2640 video_ids.extend(ids_in_page)
# Stop paging once the "Next" link disappears.
2642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2644 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing (1-based options).
2646 playliststart = self._downloader.params.get('playliststart', 1) - 1
2647 playlistend = self._downloader.params.get('playlistend', -1)
2648 if playlistend == -1:
2649 video_ids = video_ids[playliststart:]
2651 video_ids = video_ids[playliststart:playlistend]
2653 for id in video_ids:
2654 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2658 class YoutubeUserIE(InfoExtractor):
2659 """Information Extractor for YouTube users."""
2661 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2662 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2663 _GDATA_PAGE_SIZE = 50
2664 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2665 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2667 IE_NAME = u'youtube:user'
2669 def __init__(self, youtube_ie, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2671 self._youtube_ie = youtube_ie
2673 def report_download_page(self, username, start_index):
2674 """Report attempt to download user page."""
2675 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2676 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2678 def _real_initialize(self):
2679 self._youtube_ie.initialize()
2681 def _real_extract(self, url):
2683 mobj = re.match(self._VALID_URL, url)
2685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2688 username = mobj.group(1)
2690 # Download video ids using YouTube Data API. Result size per
2691 # query is limited (currently to 50 videos) so we need to query
2692 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2699 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2700 self.report_download_page(username, start_index)
2702 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2705 page = urllib2.urlopen(request).read()
2706 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2707 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2710 # Extract video identifiers
2713 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2714 if mobj.group(1) not in ids_in_page:
2715 ids_in_page.append(mobj.group(1))
2717 video_ids.extend(ids_in_page)
2719 # A little optimization - if current page is not
2720 # "full", ie. does not contain PAGE_SIZE video ids then
2721 # we can assume that this page is the last one - there
2722 # are no more ids on further pages - no need to query
2725 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2730 all_ids_count = len(video_ids)
# Apply --playlist-start / --playlist-end slicing (1-based options).
2731 playliststart = self._downloader.params.get('playliststart', 1) - 1
2732 playlistend = self._downloader.params.get('playlistend', -1)
2734 if playlistend == -1:
2735 video_ids = video_ids[playliststart:]
2737 video_ids = video_ids[playliststart:playlistend]
2739 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2740 (username, all_ids_count, len(video_ids)))
2742 for video_id in video_ids:
2743 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2746 class DepositFilesIE(InfoExtractor):
2747 """Information extractor for depositfiles.com"""
2749 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2750 IE_NAME = u'DepositFiles'
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
2755 def report_download_webpage(self, file_id):
2756 """Report webpage download."""
2757 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2759 def report_extraction(self, file_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2763 def _real_extract(self, url):
2764 # At this point we have a new file
2765 self._downloader.increment_downloads()
2767 file_id = url.split('/')[-1]
2768 # Rebuild url in english locale
2769 url = 'http://depositfiles.com/en/files/' + file_id
2771 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download' button.
2772 free_download_indication = { 'gateway_result' : '1' }
2773 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2775 self.report_download_webpage(file_id)
2776 webpage = urllib2.urlopen(request).read()
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2781 # Search for the real file URL
2782 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2783 if (mobj is None) or (mobj.group(1) is None):
2784 # Try to figure out reason of the error.
# The site reports restrictions (rate limits etc.) in an <strong> tag.
2785 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2786 if (mobj is not None) and (mobj.group(1) is not None):
2787 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2788 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2790 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2793 file_url = mobj.group(1)
2794 file_extension = os.path.splitext(file_url)[1][1:]
2796 # Search for file title
2797 mobj = re.search(r'<b title="(.*?)">', webpage)
2799 self._downloader.trouble(u'ERROR: unable to extract title')
2801 file_title = mobj.group(1).decode('utf-8')
2804 # Process file information
2805 self._downloader.process_info({
2806 'id': file_id.decode('utf-8'),
2807 'url': file_url.decode('utf-8'),
2809 'upload_date': u'NA',
2810 'title': file_title,
2811 'stitle': file_title,
2812 'ext': file_extension.decode('utf-8'),
2816 except UnavailableVideoError, err:
2817 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): elided listing — embedded line numbers and missing guard/try
# lines; only comments are added here, code text is untouched.
2820 class FacebookIE(InfoExtractor):
2821 """Information Extractor for Facebook"""
2823 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2824 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2825 _NETRC_MACHINE = 'facebook'
# Quality order, best first; used for the format-selection logic below.
2826 _available_formats = ['video', 'highqual', 'lowqual']
2827 _video_extensions = {
2832 IE_NAME = u'facebook'
2834 def __init__(self, downloader=None):
2835 InfoExtractor.__init__(self, downloader)
2837 def _reporter(self, message):
2838 """Add header and report message."""
2839 self._downloader.to_screen(u'[facebook] %s' % message)
2841 def report_login(self):
2842 """Report attempt to log in."""
2843 self._reporter(u'Logging in')
2845 def report_video_webpage_download(self, video_id):
2846 """Report attempt to download video webpage."""
2847 self._reporter(u'%s: Downloading video webpage' % video_id)
2849 def report_information_extraction(self, video_id):
2850 """Report attempt to extract video information."""
2851 self._reporter(u'%s: Extracting video information' % video_id)
2853 def _parse_page(self, video_webpage):
2854 """Extract video information from page"""
# Each field is scraped with its own regex from JS calls in the page.
2856 data = {'title': r'\("video_title", "(.*?)"\)',
2857 'description': r'<div class="datawrap">(.*?)</div>',
2858 'owner': r'\("video_owner_name", "(.*?)"\)',
2859 'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2862 for piece in data.keys():
2863 mobj = re.search(data[piece], video_webpage)
2864 if mobj is not None:
2865 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per known format, keyed by format name.
2869 for fmt in self._available_formats:
2870 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2871 if mobj is not None:
2872 # URL is in a Javascript segment inside an escaped Unicode format within
2873 # the generally utf-8 page
2874 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875 video_info['video_urls'] = video_urls
2879 def _real_initialize(self):
2880 if self._downloader is None:
2885 downloader_params = self._downloader.params
2887 # Attempt to use provided username and password or .netrc data
2888 if downloader_params.get('username', None) is not None:
2889 useremail = downloader_params['username']
2890 password = downloader_params['password']
2891 elif downloader_params.get('usenetrc', False):
2893 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2894 if info is not None:
2898 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2899 except (IOError, netrc.NetrcParseError), err:
2900 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials: skip login entirely; public videos may still work.
2903 if useremail is None:
2912 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2915 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
2916 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2917 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2923 def _real_extract(self, url):
2924 mobj = re.match(self._VALID_URL, url)
2926 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2928 video_id = mobj.group('ID')
2931 self.report_video_webpage_download(video_id)
2932 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2934 page = urllib2.urlopen(request)
2935 video_webpage = page.read()
2936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2937 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2940 # Start extracting information
2941 self.report_information_extraction(video_id)
2943 # Extract information
2944 video_info = self._parse_page(video_webpage)
2947 if 'owner' not in video_info:
2948 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2950 video_uploader = video_info['owner']
2953 if 'title' not in video_info:
2954 self._downloader.trouble(u'ERROR: unable to extract video title')
2956 video_title = video_info['title']
2957 video_title = video_title.decode('utf-8')
2958 video_title = sanitize_title(video_title)
2960 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2963 if 'thumbnail' not in video_info:
2964 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2965 video_thumbnail = ''
2967 video_thumbnail = video_info['thumbnail']
# Upload date: parse RFC-2822 style date into YYYYMMDD when possible.
2971 if 'upload_date' in video_info:
2972 upload_time = video_info['upload_date']
2973 timetuple = email.utils.parsedate_tz(upload_time)
2974 if timetuple is not None:
2976 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2981 video_description = video_info.get('description', 'No description available.')
2983 url_map = video_info['video_urls']
2984 if len(url_map.keys()) > 0:
2985 # Decide which formats to download
# Same selection logic as the YouTube extractor: honor --format and
# --max-quality against the best-first _available_formats list.
2986 req_format = self._downloader.params.get('format', None)
2987 format_limit = self._downloader.params.get('format_limit', None)
2989 if format_limit is not None and format_limit in self._available_formats:
2990 format_list = self._available_formats[self._available_formats.index(format_limit):]
2992 format_list = self._available_formats
2993 existing_formats = [x for x in format_list if x in url_map]
2994 if len(existing_formats) == 0:
2995 self._downloader.trouble(u'ERROR: no known formats available for video')
2997 if req_format is None:
2998 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2999 elif req_format == 'worst':
3000 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3001 elif req_format == '-1':
3002 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3005 if req_format not in url_map:
3006 self._downloader.trouble(u'ERROR: requested format not available')
3008 video_url_list = [(req_format, url_map[req_format])] # Specific format
3010 for format_param, video_real_url in video_url_list:
3012 # At this point we have a new video
3013 self._downloader.increment_downloads()
3016 video_extension = self._video_extensions.get(format_param, 'mp4')
3019 # Process video information
3020 self._downloader.process_info({
3021 'id': video_id.decode('utf-8'),
3022 'url': video_real_url.decode('utf-8'),
3023 'uploader': video_uploader.decode('utf-8'),
3024 'upload_date': upload_date,
3025 'title': video_title,
3026 'stitle': simple_title,
3027 'ext': video_extension.decode('utf-8'),
3028 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3029 'thumbnail': video_thumbnail.decode('utf-8'),
3030 'description': video_description.decode('utf-8'),
3033 except UnavailableVideoError, err:
3034 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing; class continues beyond this chunk. Only
# comments/docstrings are touched here.
3036 class BlipTVIE(InfoExtractor):
3037 """Information extractor for blip.tv"""
3039 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to derive the file extension from a media URL.
3040 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3041 IE_NAME = u'blip.tv'
3043 def report_extraction(self, file_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3047 def report_direct_download(self, title):
"""Report that a direct (non-JSON) media download was detected."""
3049 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3051 def _real_extract(self, url):
3052 mobj = re.match(self._VALID_URL, url)
3054 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3061 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3062 request = urllib2.Request(json_url)
3063 self.report_extraction(mobj.group(1))
3066 urlh = urllib2.urlopen(request)
3067 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3068 basename = url.split('/')[-1]
3069 title,ext = os.path.splitext(basename)
3070 title = title.decode('UTF-8')
3071 ext = ext.replace('.', '')
3072 self.report_direct_download(title)
3077 'stitle': _simplify_title(title),
3081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3082 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3084 if info is None: # Regular URL
3086 json_code = urlh.read()
3087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3088 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3092 json_data = json.loads(json_code)
3093 if 'Post' in json_data:
3094 data = json_data['Post']
3098 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3099 video_url = data['media']['url']
3100 umobj = re.match(self._URL_EXT, video_url)
3102 raise ValueError('Can not determine filename extension')
3103 ext = umobj.group(1)
3106 'id': data['item_id'],
3108 'uploader': data['display_name'],
3109 'upload_date': upload_date,
3110 'title': data['title'],
3111 'stitle': _simplify_title(data['title']),
3113 'format': data['media']['mimeType'],
3114 'thumbnail': data['thumbnailUrl'],
3115 'description': data['description'],
3116 'player_url': data['embedUrl']
3118 except (ValueError,KeyError), err:
3119 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3122 self._downloader.increment_downloads()
3125 self._downloader.process_info(info)
3126 except UnavailableVideoError, err:
3127 self._downloader.trouble(u'\nERROR: unable to download video')
3130 class MyVideoIE(InfoExtractor):
3131 """Information Extractor for myvideo.de."""
3133 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3134 IE_NAME = u'myvideo'
3136 def __init__(self, downloader=None):
3137 InfoExtractor.__init__(self, downloader)
3139 def report_download_webpage(self, video_id):
3140 """Report webpage download."""
3141 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3143 def report_extraction(self, video_id):
3144 """Report information extraction."""
3145 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3147 def _real_extract(self,url):
3148 mobj = re.match(self._VALID_URL, url)
3150 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3153 video_id = mobj.group(1)
3156 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3158 self.report_download_webpage(video_id)
3159 webpage = urllib2.urlopen(request).read()
3160 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3161 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3164 self.report_extraction(video_id)
3165 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3168 self._downloader.trouble(u'ERROR: unable to extract media URL')
3170 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3172 mobj = re.search('<title>([^<]+)</title>', webpage)
3174 self._downloader.trouble(u'ERROR: unable to extract title')
3177 video_title = mobj.group(1)
3178 video_title = sanitize_title(video_title)
3180 simple_title = _simplify_title(video_title)
3183 self._downloader.process_info({
3187 'upload_date': u'NA',
3188 'title': video_title,
3189 'stitle': simple_title,
3194 except UnavailableVideoError:
3195 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either a shortname (":tds", ":colbert", ...) or a full-episodes URL.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report mediaGen configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index (MRSS feed) download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # A shortname is rewritten to the show's full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                # NOTE(review): else-branch (`else:`) line appears elided above this.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # The landing page redirects to the newest episode; re-match the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Follow redirects so rtmpdump gets the real player URL.
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One episode is split into several acts; each <item> is one act.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # NOTE(review): the accumulation of `finfo` into `turls` appears elided.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            # NOTE(review): part of an `info = {...}` literal; other keys elided.
            'upload_date': officialDate,
            'stitle': _simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl

            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Metadata is scraped from <meta> tags; the config URL is embedded in
        # the og:video player URL's query string.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        # NOTE(review): part of an `info = {...}` literal; other keys elided.
        'uploader': showName,
        'upload_date': None,
        'stitle': _simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page carries a second, internal id used by the moogaloop XML API.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        # NOTE(review): part of an `info = {...}` literal; other keys elided.
        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # findall(...)[0] raises IndexError on missing nodes, caught by the
        # invalid-metadata handler below (its `except` line appears elided).
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flashvars).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title (page <title> minus the " - XVID..." suffix).
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        # NOTE(review): part of an `info = {...}` literal; other keys elided.
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)
        # NOTE(review): `title` is extracted but the info dict below uses
        # simple_title for both 'title' and 'stitle' — looks unintentional; verify.

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Date is rendered like "on March 1, 2012 14:30"; parsed into YYYYMMDD.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The media path is base64-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        # NOTE(review): part of an `info = {...}` literal; other keys elided.
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # The per-format entry is either a {bitrate: [urls]} mapping or a bare
        # url list; the TypeError fallback below handles the latter.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a GET; the first that opens wins.
        for url in url_list:
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
            except TypeError: # we have no bitrate info
                ext = formats[fmt][0]
                print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # Default/'best': take the first format whose URL actually responds.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# `else:` branch heads, and parts of dict literals). Tokens are reproduced as
# found; comments only added.
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: a specific video page (course+video), a course
    # page (course only), and the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): part of an `info = {...}` literal; wrapper elided.
            'id': _simplify_title(course + '_' + video),
            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # findall(...)[0] raises IndexError on malformed XML; handled by the
            # invalid-metadata branch below.
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            # NOTE(review): part of an `info = {...}` literal; wrapper elided.
            'id': _simplify_title(course),
            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fallback: use the id as title (its guarding `else:` appears elided).
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each video page becomes a 'reference' entry extracted recursively.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        # NOTE(review): the root-page `else:` branch head appears elided here.
            unescapeHTML = HTMLParser.HTMLParser().unescape
            'id': 'Stanford OpenClassroom',
            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Every course page becomes a 'reference' entry extracted recursively.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
# NOTE(review): this dump is missing source lines (guards, `return`s, `try:`s,
# and parts of dict literals). Tokens are reproduced as found; comments only added.
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard + `return` appear elided.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): error message reads "unable to mtvn_uri" — a verb
        # ("extract") is missing; cannot fix runtime strings in a doc-only pass.
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # e.g. type="video/mp4" → ext "mp4"; format like "mp4-640x480_800".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        self._downloader.increment_downloads()
        # NOTE(review): part of an `info = {...}` literal; other keys elided.
        'uploader': performer,
        'title': video_title,
        'stitle': _simplify_title(video_title),

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for all post processors.

	A PostProcessor is attached to a downloader through the downloader's
	add_post_processor() method, following the same "mutual registration"
	pattern as InfoExtractor objects. After every successful download the
	downloader walks its internal chain of PostProcessors and invokes
	run() on each of them: the first one receives an initial information
	dictionary, and each subsequent one receives the value returned by
	its predecessor. The chain stops as soon as a processor returns None
	or the end of the chain is reached.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones composed
		by InfoExtractors, extended with a "filepath" field that points to
		the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some fields
		changed) passes it on to the next processor in the chain. The
		method may also raise a PostProcessingError exception, which the
		downloader takes into account.
		"""
		return information # by default, do nothing
class AudioConversionError(Exception):
	"""Raised when the external ffmpeg/ffprobe audio conversion fails.

	Subclasses Exception (not BaseException): this is an ordinary,
	recoverable application error, and deriving from BaseException would
	let it slip past generic `except Exception` handlers, which is
	reserved for system-exiting exceptions like KeyboardInterrupt.
	"""
	def __init__(self, message):
		# Populate args/str() for normal exception reporting, and keep
		# the reason accessible as .message for existing callers.
		Exception.__init__(self, message)
		self.message = message
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that converts a downloaded video file into a
	standalone audio file by driving the external ffmpeg and ffprobe
	command line tools.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		# 'best' means: keep the source audio codec (copy) when possible
		# instead of transcoding to a fixed format.
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		# ffmpeg audio bitrate specification (e.g. '128K'), or None.
		self._preferredquality = preferredquality
		# When true, the original video file is kept after conversion.
		self._keepvideo = keepvideo

	def get_audio_codec(path):
		# Probe 'path' with ffprobe and return the name of its audio codec
		# (e.g. 'aac', 'mp3'), or None if it cannot be determined.
		# NOTE(review): declared without 'self'; presumably preceded by a
		# @staticmethod decorator on a line not visible here -- confirm.
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# Silence ffprobe's stderr chatter; only stdout is parsed.
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
		except (IOError, OSError):
		# Scan the key=value stream stream-by-stream: a codec_name is only
		# trusted once the matching 'codec_type=audio' line is seen.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		# Convert 'path' into 'out_path' with ffmpeg, using audio codec
		# 'codec' (None omits -acodec) plus extra options 'more_opts'.
		# Raises AudioConversionError on failure.
			acodec_opts = ['-acodec', codec]
		# '-vn' drops the video stream; '--' guards against filenames that
		# start with a dash.
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself is missing.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
		if p.returncode != 0:
			# Surface only ffmpeg's last stderr line as the error reason.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		# Entry point of the post processor: converts the file named by
		# information['filepath'] and rewrites that field on success.
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

		# Decide the target codec/extension/extra options.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				extension = self._preferredcodec
				# aac_adtstoasc repackages raw ADTS AAC for the MP4 container.
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
				acodec = 'libmp3lame'
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			# 'wav' maps to None: no -acodec flag, let ffmpeg pick PCM.
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
			if self._preferredcodec == 'wav':
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
			self.run_ffmpeg(path, new_path, acodec, more_opts)
			# Distinguish a conversion failure (has a reason) from any
			# other error while invoking ffmpeg.
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		# Unless asked to keep it, best-effort removal of the source video.
		if not self._keepvideo:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

		# Hand the updated info on to the next processor in the chain.
		information['filepath'] = new_path
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen(u'Updating to latest version...')

		# Fetch the current master copy of the script.
		urlh = urllib.urlopen(UPDATE_URL)
		newcontent = urlh.read()

		# Compare the embedded version string against our own; skip the
		# rewrite when we are already up to date.
		vmatch = re.search("__version__ = '([^']+)'", newcontent)
		if vmatch is not None and vmatch.group(1) == __version__:
			downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')

		# Overwrite this very script file in binary mode.
		outf = open(filename, 'wb')
			outf.write(newcontent)
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
	# Read extra command line arguments from a configuration file and
	# return them as a list (shlex-split, '#' comments ignored).
		optionf = open(filename_bytes)
		# A missing config file is not an error.
		return [] # silently skip if file is not present
			res += shlex.split(l, comments=True)
def _format_option_string(option):
	''' ('-o', '--option') -> -o, --format METAVAR'''
	# Show at most one short and one long form, comma-separated, followed
	# by the metavar when the option takes a value.
	if option._short_opts: opts.append(option._short_opts[0])
	if option._long_opts: opts.append(option._long_opts[0])
	if len(opts) > 1: opts.insert(1, ', ')

	if option.takes_value(): opts.append(' %s' % option.metavar)

	return "".join(opts)
def _find_term_columns():
	# Determine the terminal width, preferring the COLUMNS environment
	# variable and falling back to asking `stty size`.
	columns = os.environ.get('COLUMNS', None)
		sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		out,err = sp.communicate()
		# `stty size` prints "rows cols"; the second field is the width.
		return int(out.split()[1])
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	# Use a custom formatter so option strings render as "-o, --option".
	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

		'version' : __version__,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',

	parser = optparse.OptionParser(**kw)
	# Option groups; each is populated below and attached to the parser.
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	# General options.
	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	# Which videos of a playlist / search to pick.
	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	# Site credentials.
	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

	# Format selection.
	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

	# Output verbosity and pure-simulation switches.
	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	# NOTE: '-v' is also used by '--version' above; conflict_handler
	# 'resolve' lets this later definition win for the short flag.
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)

	# Where and how files end up on disk.
	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)

	# Audio extraction (ffmpeg) post-processing.
	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')

	# Attach the groups; their order here is the order shown in --help.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)
	# Per-user config location honours XDG_CONFIG_HOME when set,
	# otherwise falls back to ~/.config/youtube-dl.conf.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	# System-wide config, then user config, then the actual command line
	# (later entries override earlier ones during parsing).
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	# These extractors are shared: the playlist/user/search extractors
	# delegate individual videos to the plain site extractor instance.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		MetacafeIE(youtube_ie),
		GoogleSearchIE(google_ie),
		YahooSearchIE(yahoo_ie),
		StanfordOpenClassroomIE(),
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load the file if it already exists and is readable;
			# otherwise start empty and (possibly) create it on save.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Print the User-Agent we send and stop, if requested.
	if opts.dump_user_agent:
		print std_headers['User-Agent']

	# Batch file verification
	if opts.batchfile is not None:
			if opts.batchfile == '-':
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Drop blanks and lines starting with a comment marker.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	# Install globally so every urllib2 request in the program uses it.
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	# --list-extractors: show each extractor and which of the given URLs
	# it would handle, then exit.
	if opts.list_extractors:
		for ie in extractors:
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	# Username without password: prompt interactively (never on argv).
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	# Normalize string options to their numeric forms.
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
		opts.playlistend = int(opts.playlistend)
		# -1 is the sentinel for "until the end of the playlist".
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')
	# Build the downloader from the parsed options. Any of the 'get*'
	# print-only switches implies quiet mode and skipping the download.
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First matching template wins: explicit -o, then the -f -1
		# (all formats) variants, then title/literal/autonumber combos,
		# finally the bare id default.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Writing the video to stdout means logs must go to stderr.
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
	# Register every supported extractor, in priority order.
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# Register post processors.
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
	# Self-update before downloading anything.
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# URLs are only mandatory when not just self-updating.
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')

		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')
	# Translate the known failure modes into clean process exits instead
	# of tracebacks.
	except DownloadError:
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
# Script entry point when executed directly.
if __name__ == '__main__':

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: