2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
207 def yield_preferredencoding():
209 pref = locale.getpreferredencoding()
215 return yield_preferredencoding().next()
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
224 entity = matchobj.group(1)
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
236 numstr = u'0%s' % numstr
239 return unichr(long(numstr, base))
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
251 def sanitize_open(filename, open_mode):
252 """Try to open the given filename, and slightly tweak it if this fails.
254 Attempts to open the given filename. If this fails, it tries to change
255 the filename slightly, step by step, until it's either able to open it
256 or it fails and raises a final exception, like the standard open()
259 It returns the tuple (stream, definitive_file_name).
263 if sys.platform == 'win32':
265 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
266 return (sys.stdout, filename)
267 stream = open(_encodeFilename(filename), open_mode)
268 return (stream, filename)
269 except (IOError, OSError), err:
270 # In case of error, try to remove win32 forbidden chars
271 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
273 # An exception here should be caught in the caller
274 stream = open(_encodeFilename(filename), open_mode)
275 return (stream, filename)
278 def timeconvert(timestr):
279 """Convert RFC 2822 defined time string into system timestamp"""
281 timetuple = email.utils.parsedate_tz(timestr)
282 if timetuple is not None:
283 timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
298 def _unescapeHTML(s):
300 @param s a string (of type unicode)
302 assert type(s) == type(u'')
304 htmlParser = HTMLParser.HTMLParser()
305 return htmlParser.unescape(s)
307 def _encodeFilename(s):
309 @param s The name of the file (of type unicode)
312 assert type(s) == type(u'')
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
322 class DownloadError(Exception):
323 """Download Error exception.
325 This exception may be thrown by FileDownloader objects if they are not
326 configured to continue on errors. They will contain the appropriate
332 class SameFileError(Exception):
333 """Same File exception.
335 This exception will be thrown by FileDownloader objects if they detect
336 multiple files would have to be downloaded to the same file on disk.
341 class PostProcessingError(Exception):
342 """Post Processing exception.
344 This exception may be raised by PostProcessor's .run() method to
345 indicate an error in the postprocessing task.
349 class MaxDownloadsReached(Exception):
350 """ --max-downloads limit has been reached. """
354 class UnavailableVideoError(Exception):
355 """Unavailable Format exception.
357 This exception will be thrown when a video is requested
358 in a format that is not available for that video.
363 class ContentTooShortError(Exception):
364 """Content Too Short exception.
366 This exception may be raised by FileDownloader objects when a file they
367 download is too small for what the server announced first, indicating
368 the connection was probably interrupted.
374 def __init__(self, downloaded, expected):
375 self.downloaded = downloaded
376 self.expected = expected
379 class YoutubeDLHandler(urllib2.HTTPHandler):
380 """Handler for HTTP requests and responses.
382 This class, when installed with an OpenerDirector, automatically adds
383 the standard headers to every HTTP request and handles gzipped and
384 deflated responses from web servers. If compression is to be avoided in
385 a particular request, the original request in the program code only has
386 to include the HTTP header "Youtubedl-No-Compression", which will be
387 removed before making the real request.
389 Part of this code was copied from:
391 http://techknack.net/python-urllib2-handlers/
393 Andrew Rowls, the author of that code, agreed to release it to the
400 return zlib.decompress(data, -zlib.MAX_WBITS)
402 return zlib.decompress(data)
405 def addinfourl_wrapper(stream, headers, url, code):
406 if hasattr(urllib2.addinfourl, 'getcode'):
407 return urllib2.addinfourl(stream, headers, url, code)
408 ret = urllib2.addinfourl(stream, headers, url)
412 def http_request(self, req):
413 for h in std_headers:
416 req.add_header(h, std_headers[h])
417 if 'Youtubedl-no-compression' in req.headers:
418 if 'Accept-encoding' in req.headers:
419 del req.headers['Accept-encoding']
420 del req.headers['Youtubedl-no-compression']
423 def http_response(self, req, resp):
426 if resp.headers.get('Content-encoding', '') == 'gzip':
427 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
431 if resp.headers.get('Content-encoding', '') == 'deflate':
432 gz = StringIO.StringIO(self.deflate(resp.read()))
433 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
434 resp.msg = old_resp.msg
438 class FileDownloader(object):
439 """File Downloader class.
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
501 _download_retcode = None
502 _num_downloads = None
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
515 def format_bytes(bytes):
518 if type(bytes) is str:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
529 def calc_percent(byte_counter, data_len):
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
535 def calc_eta(start, now, total, current):
539 if current == 0 or dif < 0.001: # One millisecond
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
546 return '%02d:%02d' % (eta_mins, eta_secs)
549 def calc_speed(start, now, bytes):
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
561 rate = bytes / elapsed_time
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
578 def add_info_extractor(self, ie):
579 """Add an InfoExtractor object to the end of the list."""
581 ie.set_downloader(self)
583 def add_post_processor(self, pp):
584 """Add a PostProcessor object to the end of the chain."""
586 pp.set_downloader(self)
588 def to_screen(self, message, skip_eol=False):
589 """Print message to stdout if not in quiet mode."""
590 assert type(message) == type(u'')
591 if not self.params.get('quiet', False):
592 terminator = [u'\n', u''][skip_eol]
593 output = message + terminator
595 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596 output = output.encode(preferredencoding(), 'ignore')
597 self._screen_file.write(output)
598 self._screen_file.flush()
def to_stderr(self, message):
    """Write message, encoded for the current locale and followed by a
    newline, to standard error."""
    sys.stderr.write(message.encode(preferredencoding()) + '\n')
604 def to_cons_title(self, message):
605 """Set console/terminal window title to message."""
606 if not self.params.get('consoletitle', False):
608 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
609 # c_wchar_p() might not be necessary if `message` is
610 # already of type unicode()
611 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
612 elif 'TERM' in os.environ:
613 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
619 def trouble(self, message=None):
620 """Determine action to take when a download problem appears.
622 Depending on if the downloader has been configured to ignore
623 download errors or not, this method may throw an exception or
624 not when errors are found, after printing the message.
626 if message is not None:
627 self.to_stderr(message)
628 if not self.params.get('ignoreerrors', False):
629 raise DownloadError(message)
630 self._download_retcode = 1
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
638 elapsed = now - start_time
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
650 return filename + u'.part'
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
657 def try_rename(self, old_filename, new_filename):
659 if old_filename == new_filename:
661 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
662 except (IOError, OSError), err:
663 self.trouble(u'ERROR: unable to rename file')
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
669 if not os.path.isfile(_encodeFilename(filename)):
671 timestr = last_modified_hdr
674 filetime = timeconvert(timestr)
678 os.utime(filename, (time.time(), filetime))
683 def report_writedescription(self, descfn):
684 """ Report that the description file is being written """
685 self.to_screen(u'[info] Writing video description to: ' + descfn)
687 def report_writesubtitles(self, srtfn):
688 """ Report that the subtitles file is being written """
689 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
691 def report_writeinfojson(self, infofn):
692 """ Report that the metadata file has been written """
693 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
695 def report_destination(self, filename):
696 """Report destination filename."""
697 self.to_screen(u'[download] Destination: ' + filename)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
708 def report_resuming_byte(self, resume_len):
709 """Report attempt to resume at given byte."""
710 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
712 def report_retry(self, count, retries):
713 """Report retry in case of HTTP error 5xx"""
714 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
716 def report_file_already_downloaded(self, file_name):
717 """Report file has already been fully downloaded."""
719 self.to_screen(u'[download] %s has already been downloaded' % file_name)
720 except (UnicodeEncodeError), err:
721 self.to_screen(u'[download] The file has already been downloaded')
723 def report_unable_to_resume(self):
724 """Report it was impossible to resume download."""
725 self.to_screen(u'[download] Unable to resume')
727 def report_finish(self):
728 """Report download finished."""
729 if self.params.get('noprogress', False):
730 self.to_screen(u'[download] Download completed')
734 def increment_downloads(self):
735 """Increment the ordinal that assigns a number to each file."""
736 self._num_downloads += 1
738 def prepare_filename(self, info_dict):
739 """Generate the output filename."""
741 template_dict = dict(info_dict)
742 template_dict['epoch'] = unicode(long(time.time()))
743 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744 filename = self.params['outtmpl'] % template_dict
746 except (ValueError, KeyError), err:
747 self.trouble(u'ERROR: invalid system charset or erroneous output template')
750 def _match_entry(self, info_dict):
751 """ Returns None iff the file should be downloaded """
753 title = info_dict['title']
754 matchtitle = self.params.get('matchtitle', False)
755 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757 rejecttitle = self.params.get('rejecttitle', False)
758 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
775 filename = self.prepare_filename(info_dict)
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
806 if self.params.get('writedescription', False):
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
812 descfile.write(info_dict['description'].encode('utf-8'))
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
843 infof = open(_encodeFilename(infofn), 'wb')
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
881 suitable_found = False
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
887 # Suitable InfoExtractor found
888 suitable_found = True
890 # Extract information from URL and process it
893 # Suitable InfoExtractor had been found; go to next URL
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
899 return self._download_retcode
901 def post_process(self, filename, ie_info):
902 """Run the postprocessing chain on the given file."""
904 info['filepath'] = filename
910 def _download_with_rtmpdump(self, filename, url, player_url):
911 self.report_destination(filename)
912 tmpfilename = self.temp_name(filename)
914 # Check for rtmpdump first
916 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917 except (OSError, IOError):
918 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
921 # Download using rtmpdump. rtmpdump returns exit code 2 when
922 # the connection was interrumpted and resuming appears to be
923 # possible. This is part of rtmpdump's normal usage, AFAIK.
924 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926 if self.params.get('verbose', False):
929 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
932 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933 retval = subprocess.call(args)
934 while retval == 2 or retval == 1:
935 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937 time.sleep(5.0) # This seems to be needed
938 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939 cursize = os.path.getsize(_encodeFilename(tmpfilename))
940 if prevsize == cursize and retval == 1:
942 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943 if prevsize == cursize and retval == 2 and cursize > 1024:
944 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
948 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949 self.try_rename(tmpfilename, filename)
952 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
968 tmpfilename = self.temp_name(filename)
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1034 if count <= retries:
1035 self.report_retry(count, retries)
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1053 if len(data_block) == 0:
1055 byte_counter += len(data_block)
1057 # Open file just in time
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1072 block_size = self.best_block_size(after - before, len(data_block))
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1084 self.slow_down(start, byte_counter - resume_len)
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1102 class InfoExtractor(object):
1103 """Information Extractor class.
1105 Information extractors are the classes that, given a URL, extract
1106 information from the video (or videos) the URL refers to. This
1107 information includes the real video URL, the video title and simplified
1108 title, author and others. The information is stored in a dictionary
1109 which is then passed to the FileDownloader. The FileDownloader
1110 processes this information possibly downloading the video to the file
1111 system, among other possible outcomes. The dictionaries must include
1112 the following fields:
1114 id: Video identifier.
1115 url: Final video URL.
1116 uploader: Nickname of the video uploader.
1117 title: Literal title.
1118 stitle: Simplified title.
1119 ext: Video filename extension.
1120 format: Video format.
1121 player_url: SWF Player URL (may be None).
1123 The following fields are optional. Their primary purpose is to allow
1124 youtube-dl to serve as the backend for a video search function, such
1125 as the one in youtube2mp3. They are only used when their respective
1126 forced printing functions are called:
1128 thumbnail: Full URL to a video thumbnail image.
1129 description: One-line video description.
1131 Subclasses of this one should re-define the _real_initialize() and
1132 _real_extract() methods and define a _VALID_URL regexp.
1133 Probably, they should also be added to the list of extractors.
# Template-method pattern: the public initialize()/extract() entry points
# delegate to the _real_* hooks that concrete extractors override.
1139 def __init__(self, downloader=None):
1140 """Constructor. Receives an optional downloader."""
1142 self.set_downloader(downloader)
1144 def suitable(self, url):
1145 """Receives a URL and returns True if suitable for this IE."""
# A single regexp gate: subclasses only need to define _VALID_URL.
1146 return re.match(self._VALID_URL, url) is not None
1148 def initialize(self):
1149 """Initializes an instance (authentication, etc)."""
1151 self._real_initialize()
1154 def extract(self, url):
1155 """Extracts URL information and returns it in list of dicts."""
1157 return self._real_extract(url)
1159 def set_downloader(self, downloader):
1160 """Sets the downloader for this IE."""
# The downloader is also used as the reporting/error channel
# (to_screen, to_stderr, trouble) throughout the extractors.
1161 self._downloader = downloader
1163 def _real_initialize(self):
1164 """Real initialization process. Redefine in subclasses."""
1167 def _real_extract(self, url):
1168 """Real extraction process. Redefine in subclasses."""
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
# Accepts youtu.be short links, youtube(-nocookie).com watch/embed/v URLs,
# and bare video ids; group(2) of a match is the video id.
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Forces the site to English so the scraping regexps below match.
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in ~/.netrc.
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
# Same codes, with free (WebM) formats ranked above their non-free peers.
1182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag format codes to file extensions.
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# Maps itag format codes to human-readable dimensions (for --list-formats).
1195 _video_dimensions = {
1211 IE_NAME = u'youtube'
# Thin progress-reporting helpers; all user-visible status goes through
# the downloader's to_screen channel with an '[youtube]' prefix.
1213 def report_lang(self):
1214 """Report attempt to set language."""
1215 self._downloader.to_screen(u'[youtube] Setting language')
1217 def report_login(self):
1218 """Report attempt to log in."""
1219 self._downloader.to_screen(u'[youtube] Logging in')
1221 def report_age_confirmation(self):
1222 """Report attempt to confirm age."""
1223 self._downloader.to_screen(u'[youtube] Confirming age')
1225 def report_video_webpage_download(self, video_id):
1226 """Report attempt to download video webpage."""
1227 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1229 def report_video_info_webpage_download(self, video_id):
1230 """Report attempt to download video info webpage."""
1231 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1233 def report_video_subtitles_download(self, video_id):
1234 """Report attempt to download video subtitles."""
1235 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1237 def report_information_extraction(self, video_id):
1238 """Report attempt to extract video information."""
1239 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1241 def report_unavailable_format(self, video_id, format):
1242 """Report extracted video URL."""
1243 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1245 def report_rtmp_download(self):
1246 """Indicate the download will use the RTMP protocol."""
1247 self._downloader.to_screen(u'[youtube] RTMP download detected')
1249 def _closed_captions_xml_to_srt(self, xml_string):
1251 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1252 # TODO parse xml instead of regex
1253 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1254 if not dur: dur = '4'
1255 start = float(start)
1256 end = start + float(dur)
1257 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1258 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1260 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1261 srt += str(n) + '\n'
1262 srt += start + ' --> ' + end + '\n'
1263 srt += caption + '\n\n'
1266 def _print_formats(self, formats):
# Prints one line per available format: code, extension, dimensions
# (used by --list-formats).
1267 print 'Available formats:'
# x: a format code from 'formats' -- loop header not visible here, confirm upstream.
1269 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1271 def _real_initialize(self):
# Session setup: resolve credentials, force English UI, log in,
# then confirm age. Failures are reported via the downloader.
1272 if self._downloader is None:
1277 downloader_params = self._downloader.params
1279 # Attempt to use provided username and password or .netrc data
1280 if downloader_params.get('username', None) is not None:
1281 username = downloader_params['username']
1282 password = downloader_params['password']
1283 elif downloader_params.get('usenetrc', False):
1285 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1286 if info is not None:
1290 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1291 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
1292 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language to English so subsequent page scraping works.
1296 request = urllib2.Request(self._LANG_URL)
1299 urllib2.urlopen(request).read()
1300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1301 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1304 # No authentication to be performed
1305 if username is None:
# Log in by POSTing the site's login form fields.
1310 'current_form': 'loginForm',
1312 'action_login': 'Log In',
1313 'username': username,
1314 'password': password,
1316 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1319 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1320 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1321 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age to unlock age-restricted videos.
1330 'action_confirm': 'Confirm',
1332 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1334 self.report_age_confirmation()
1335 age_results = urllib2.urlopen(request).read()
1336 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1337 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1340 def _real_extract(self, url):
# Pipeline: parse id -> fetch watch page -> fetch get_video_info ->
# scrape metadata -> pick formats -> hand each to the downloader.
1341 # Extract video id from URL
1342 mobj = re.match(self._VALID_URL, url)
1344 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1346 video_id = mobj.group(2)
# Download the watch page (has_verified=1 skips some interstitials).
1349 self.report_video_webpage_download(video_id)
1350 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1352 video_webpage = urllib2.urlopen(request).read()
1353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1357 # Attempt to extract SWF player URL
1358 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1359 if mobj is not None:
# Unescape the JSON-escaped URL (\\/ -> /).
1360 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' contexts; the first response containing a
# 'token' parameter wins.
1365 self.report_video_info_webpage_download(video_id)
1366 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1367 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1368 % (video_id, el_type))
1369 request = urllib2.Request(video_info_url)
1371 video_info_webpage = urllib2.urlopen(request).read()
1372 video_info = parse_qs(video_info_webpage)
1373 if 'token' in video_info:
1375 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1376 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Without a token the video is unplayable; surface YouTube's own
# 'reason' message when one is provided.
1378 if 'token' not in video_info:
1379 if 'reason' in video_info:
1380 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1382 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1385 # Start extracting information
1386 self.report_information_extraction(video_id)
# Uploader nickname.
1389 if 'author' not in video_info:
1390 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1392 video_uploader = urllib.unquote_plus(video_info['author'][0])
# Title (also kept in a filesystem-safe simplified form).
1395 if 'title' not in video_info:
1396 self._downloader.trouble(u'ERROR: unable to extract video title')
1398 video_title = urllib.unquote_plus(video_info['title'][0])
1399 video_title = video_title.decode('utf-8')
1400 video_title = sanitize_title(video_title)
1403 simple_title = _simplify_title(video_title)
# Thumbnail is optional; fall back to an empty string.
1406 if 'thumbnail_url' not in video_info:
1407 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1408 video_thumbnail = ''
1409 else: # don't panic if we can't find it
1410 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scrape the page and try several date layouts,
# normalizing to YYYYMMDD.
1414 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1415 if mobj is not None:
1416 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1417 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1418 for expression in format_expressions:
1420 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description: prefer the full text from the page DOM, falling back
# to the meta tag, then a placeholder.
1428 video_description = u'No description available.'
1429 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1430 if mobj is not None:
1431 video_description = mobj.group(1).decode('utf-8')
1433 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1434 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1435 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1436 # TODO use another parser
# Closed captions: list available languages, pick the requested
# language (or English, or the first listed), convert XML to SRT.
1439 video_subtitles = None
1440 if self._downloader.params.get('writesubtitles', False):
1441 self.report_video_subtitles_download(video_id)
1442 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1444 srt_list = urllib2.urlopen(request).read()
1445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1446 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1448 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1450 if self._downloader.params.get('subtitleslang', False):
1451 srt_lang = self._downloader.params.get('subtitleslang')
1452 elif 'en' in srt_lang_list:
1455 srt_lang = srt_lang_list[0]
1456 if not srt_lang in srt_lang_list:
1457 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1459 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1461 srt_xml = urllib2.urlopen(request).read()
1462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1463 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1465 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1467 self._downloader.trouble(u'WARNING: video has no closed captions')
1470 video_token = urllib.unquote_plus(video_info['token'][0])
1472 # Decide which formats to download
1473 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; HTTP streams come as a
# comma-separated itag/url map in 'url_encoded_fmt_stream_map'.
1475 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1476 self.report_rtmp_download()
1477 video_url_list = [(None, video_info['conn'][0])]
1478 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1479 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1480 url_data = [parse_qs(uds) for uds in url_data_strs]
1481 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1482 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# Honor --max-quality by truncating the preference list.
1484 format_limit = self._downloader.params.get('format_limit', None)
1485 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1486 if format_limit is not None and format_limit in available_formats:
1487 format_list = available_formats[available_formats.index(format_limit):]
1489 format_list = available_formats
1490 existing_formats = [x for x in format_list if x in url_map]
1491 if len(existing_formats) == 0:
1492 self._downloader.trouble(u'ERROR: no known formats available for video')
1494 if self._downloader.params.get('listformats', None):
1495 self._print_formats(existing_formats)
1497 if req_format is None or req_format == 'best':
1498 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1499 elif req_format == 'worst':
1500 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1501 elif req_format in ('-1', 'all'):
1502 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1504 # Specific formats. We pick the first in a slash-delimited sequence.
1505 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1506 req_formats = req_format.split('/')
1507 video_url_list = None
1508 for rf in req_formats:
1510 video_url_list = [(rf, url_map[rf])]
1512 if video_url_list is None:
1513 self._downloader.trouble(u'ERROR: requested format not available')
1516 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand each selected (format, url) pair to the downloader.
1519 for format_param, video_real_url in video_url_list:
1520 # At this point we have a new video
1521 self._downloader.increment_downloads()
1524 video_extension = self._video_extensions.get(format_param, 'flv')
1527 # Process video information
1528 self._downloader.process_info({
1529 'id': video_id.decode('utf-8'),
1530 'url': video_real_url.decode('utf-8'),
1531 'uploader': video_uploader.decode('utf-8'),
1532 'upload_date': upload_date,
1533 'title': video_title,
1534 'stitle': simple_title,
1535 'ext': video_extension.decode('utf-8'),
# Pre-ternary Python 2 'and/or' idiom: u'NA' when format_param is None.
1536 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1537 'thumbnail': video_thumbnail.decode('utf-8'),
1538 'description': video_description,
1539 'player_url': player_url,
1540 'subtitles': video_subtitles
1542 except UnavailableVideoError, err:
1543 self._downloader.trouble(u'\nERROR: unable to download video')
1546 class MetacafeIE(InfoExtractor):
1547 """Information Extractor for metacafe.com."""
# group(1) is the numeric video id, group(2) the simplified title slug.
1549 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Endpoints for disabling the family filter before extraction.
1550 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1551 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1553 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Constructor.

    youtube_ie: the YouTube extractor used when a Metacafe id turns
        out to be a 'yt-' prefixed YouTube video.
    downloader: optional FileDownloader, forwarded to the base class.
    """
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[metacafe]'.
1559 def report_disclaimer(self):
1560 """Report disclaimer retrieval."""
1561 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1563 def report_age_confirmation(self):
1564 """Report attempt to confirm age."""
1565 self._downloader.to_screen(u'[metacafe] Confirming age')
1567 def report_download_webpage(self, video_id):
1568 """Report webpage download."""
1569 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1571 def report_extraction(self, video_id):
1572 """Report information extraction."""
1573 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1575 def _real_initialize(self):
# Fetch the family-filter disclaimer page, then POST the
# confirmation form so filtered videos become reachable.
1576 # Retrieve disclaimer
1577 request = urllib2.Request(self._DISCLAIMER)
1579 self.report_disclaimer()
1580 disclaimer = urllib2.urlopen(request).read()
1581 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1582 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age (disables the family filter for this session).
1588 'submit': "Continue - I'm over 18",
1590 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1592 self.report_age_confirmation()
1593 disclaimer = urllib2.urlopen(request).read()
1594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1595 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1598 def _real_extract(self, url):
1599 # Extract id and simplified title from URL
1600 mobj = re.match(self._VALID_URL, url)
1602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1605 video_id = mobj.group(1)
1607 # Check if video comes from YouTube
# 'yt-<id>' ids are YouTube mirrors; delegate to the YouTube extractor.
1608 mobj2 = re.match(r'^yt-(.*)$', video_id)
1609 if mobj2 is not None:
1610 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1613 # At this point we have a new video
1614 self._downloader.increment_downloads()
1616 simple_title = mobj.group(2).decode('utf-8')
1618 # Retrieve video webpage to extract further information
1619 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1621 self.report_download_webpage(video_id)
1622 webpage = urllib2.urlopen(request).read()
1623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1627 # Extract URL, uploader and title from webpage
1628 self.report_extraction(video_id)
# Old-style pages expose mediaURL directly; append gdaKey when present.
1629 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1630 if mobj is not None:
1631 mediaURL = urllib.unquote(mobj.group(1))
1632 video_extension = mediaURL[-3:]
1634 # Extract gdaKey if available
1635 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1637 video_url = mediaURL
1639 gdaKey = mobj.group(1)
1640 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer pages embed the media URL and key inside the flashvars blob.
1642 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1644 self._downloader.trouble(u'ERROR: unable to extract media URL')
1646 vardict = parse_qs(mobj.group(1))
1647 if 'mediaData' not in vardict:
1648 self._downloader.trouble(u'ERROR: unable to extract media URL')
1650 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1652 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape JSON-escaped slashes in the URL.
1654 mediaURL = mobj.group(1).replace('\\/', '/')
1655 video_extension = mediaURL[-3:]
1656 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1658 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract title')
1662 video_title = mobj.group(1).decode('utf-8')
1663 video_title = sanitize_title(video_title)
1665 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1667 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1669 video_uploader = mobj.group(1)
1672 # Process video information
1673 self._downloader.process_info({
1674 'id': video_id.decode('utf-8'),
1675 'url': video_url.decode('utf-8'),
1676 'uploader': video_uploader.decode('utf-8'),
1677 'upload_date': u'NA',
1678 'title': video_title,
1679 'stitle': simple_title,
1680 'ext': video_extension.decode('utf-8'),
1684 except UnavailableVideoError:
1685 self._downloader.trouble(u'\nERROR: unable to download video')
1688 class DailymotionIE(InfoExtractor):
1689 """Information Extractor for Dailymotion"""
# group(1) is the video id (before the first underscore), group(2) the slug.
1691 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1692 IE_NAME = u'dailymotion'
1694 def __init__(self, downloader=None):
1695 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[dailymotion]'.
1697 def report_download_webpage(self, video_id):
1698 """Report webpage download."""
1699 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1701 def report_extraction(self, video_id):
1702 """Report information extraction."""
1703 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1705 def _real_extract(self, url):
1706 # Extract id and simplified title from URL
1707 mobj = re.match(self._VALID_URL, url)
1709 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1712 # At this point we have a new video
1713 self._downloader.increment_downloads()
1714 video_id = mobj.group(1)
1716 video_extension = 'flv'
1718 # Retrieve video webpage to extract further information
1719 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
1720 request.add_header('Cookie', 'family_filter=off')
1722 self.report_download_webpage(video_id)
1723 webpage = urllib2.urlopen(request).read()
1724 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1725 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1728 # Extract URL, uploader and title from webpage
1729 self.report_extraction(video_id)
# The player 'sequence' flashvar holds the stream URLs.
1730 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1732 self._downloader.trouble(u'ERROR: unable to extract media URL')
1734 sequence = urllib.unquote(mobj.group(1))
1735 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1737 self._downloader.trouble(u'ERROR: unable to extract media URL')
1739 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1741 # if needed add http://www.dailymotion.com/ if relative URL
1743 video_url = mediaURL
1745 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1747 self._downloader.trouble(u'ERROR: unable to extract title')
1749 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1750 video_title = sanitize_title(video_title)
1751 simple_title = _simplify_title(video_title)
1753 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1755 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1757 video_uploader = mobj.group(1)
1760 # Process video information
1761 self._downloader.process_info({
1762 'id': video_id.decode('utf-8'),
1763 'url': video_url.decode('utf-8'),
1764 'uploader': video_uploader.decode('utf-8'),
1765 'upload_date': u'NA',
1766 'title': video_title,
1767 'stitle': simple_title,
1768 'ext': video_extension.decode('utf-8'),
1772 except UnavailableVideoError:
1773 self._downloader.trouble(u'\nERROR: unable to download video')
1776 class GoogleIE(InfoExtractor):
1777 """Information extractor for video.google.com."""
# Matches the localized video.google TLDs; group(1) is the docid.
1779 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1780 IE_NAME = u'video.google'
1782 def __init__(self, downloader=None):
1783 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[video.google]'.
1785 def report_download_webpage(self, video_id):
1786 """Report webpage download."""
1787 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1789 def report_extraction(self, video_id):
1790 """Report information extraction."""
1791 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1793 def _real_extract(self, url):
1794 # Extract id from URL
1795 mobj = re.match(self._VALID_URL, url)
1797 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1800 # At this point we have a new video
1801 self._downloader.increment_downloads()
1802 video_id = mobj.group(1)
1804 video_extension = 'mp4'
1806 # Retrieve video webpage to extract further information
1807 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1809 self.report_download_webpage(video_id)
1810 webpage = urllib2.urlopen(request).read()
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1815 # Extract URL, uploader, and title from webpage
1816 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv videoUrl.
1817 mobj = re.search(r"download_url:'([^']+)'", webpage)
1819 video_extension = 'flv'
1820 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1822 self._downloader.trouble(u'ERROR: unable to extract media URL')
1824 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes left in the URL (= and &).
1825 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1826 mediaURL = mediaURL.replace('\\x26', '\x26')
1828 video_url = mediaURL
1830 mobj = re.search(r'<title>(.*)</title>', webpage)
1832 self._downloader.trouble(u'ERROR: unable to extract title')
1834 video_title = mobj.group(1).decode('utf-8')
1835 video_title = sanitize_title(video_title)
1836 simple_title = _simplify_title(video_title)
1838 # Extract video description
1839 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1841 self._downloader.trouble(u'ERROR: unable to extract video description')
1843 video_description = mobj.group(1).decode('utf-8')
1844 if not video_description:
1845 video_description = 'No description available.'
1847 # Extract video thumbnail
# The thumbnail lives on the search results page, so only fetch it
# when --get-thumbnail explicitly asks for it.
1848 if self._downloader.params.get('forcethumbnail', False):
1849 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1851 webpage = urllib2.urlopen(request).read()
1852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1855 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1857 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1859 video_thumbnail = mobj.group(1)
1860 else: # we need something to pass to process_info
1861 video_thumbnail = ''
1864 # Process video information
1865 self._downloader.process_info({
1866 'id': video_id.decode('utf-8'),
1867 'url': video_url.decode('utf-8'),
1869 'upload_date': u'NA',
1870 'title': video_title,
1871 'stitle': simple_title,
1872 'ext': video_extension.decode('utf-8'),
1876 except UnavailableVideoError:
1877 self._downloader.trouble(u'\nERROR: unable to download video')
1880 class PhotobucketIE(InfoExtractor):
1881 """Information extractor for photobucket.com."""
# group(1) is the .flv filename from the 'current' query parameter.
1883 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1884 IE_NAME = u'photobucket'
1886 def __init__(self, downloader=None):
1887 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[photobucket]'.
1889 def report_download_webpage(self, video_id):
1890 """Report webpage download."""
1891 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1893 def report_extraction(self, video_id):
1894 """Report information extraction."""
1895 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1897 def _real_extract(self, url):
1898 # Extract id from URL
1899 mobj = re.match(self._VALID_URL, url)
1901 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1904 # At this point we have a new video
1905 self._downloader.increment_downloads()
1906 video_id = mobj.group(1)
1908 video_extension = 'flv'
1910 # Retrieve video webpage to extract further information
1911 request = urllib2.Request(url)
1913 self.report_download_webpage(video_id)
1914 webpage = urllib2.urlopen(request).read()
1915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1916 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919 # Extract URL, uploader, and title from webpage
1920 self.report_extraction(video_id)
1921 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1923 self._downloader.trouble(u'ERROR: unable to extract media URL')
1925 mediaURL = urllib.unquote(mobj.group(1))
1927 video_url = mediaURL
1929 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1931 self._downloader.trouble(u'ERROR: unable to extract title')
1933 video_title = mobj.group(1).decode('utf-8')
1934 video_title = sanitize_title(video_title)
1935 simple_title = _simplify_title(vide_title)
1937 video_uploader = mobj.group(2).decode('utf-8')
1940 # Process video information
1941 self._downloader.process_info({
1942 'id': video_id.decode('utf-8'),
1943 'url': video_url.decode('utf-8'),
1944 'uploader': video_uploader,
1945 'upload_date': u'NA',
1946 'title': video_title,
1947 'stitle': simple_title,
1948 'ext': video_extension.decode('utf-8'),
1952 except UnavailableVideoError:
1953 self._downloader.trouble(u'\nERROR: unable to download video')
1956 class YahooIE(InfoExtractor):
1957 """Information extractor for video.yahoo.com."""
1959 # _VALID_URL matches all Yahoo! Video URLs
1960 # _VPAGE_URL matches only the extractable '/watch/' URLs
1961 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1962 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1963 IE_NAME = u'video.yahoo'
1965 def __init__(self, downloader=None):
1966 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers, prefixed '[video.yahoo]'.
1968 def report_download_webpage(self, video_id):
1969 """Report webpage download."""
1970 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1972 def report_extraction(self, video_id):
1973 """Report information extraction."""
1974 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1976 def _real_extract(self, url, new_video=True):
# Non-/watch/ URLs are first rewritten to the canonical /watch/ form
# and re-dispatched once (new_video=False guards against looping).
1977 # Extract ID from URL
1978 mobj = re.match(self._VALID_URL, url)
1980 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1983 # At this point we have a new video
1984 self._downloader.increment_downloads()
1985 video_id = mobj.group(2)
1986 video_extension = 'flv'
1988 # Rewrite valid but non-extractable URLs as
1989 # extractable English language /watch/ URLs
1990 if re.match(self._VPAGE_URL, url) is None:
1991 request = urllib2.Request(url)
1993 webpage = urllib2.urlopen(request).read()
1994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1995 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1998 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2000 self._downloader.trouble(u'ERROR: Unable to extract id field')
2002 yahoo_id = mobj.group(1)
2004 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2006 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2008 yahoo_vid = mobj.group(1)
2010 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2011 return self._real_extract(url, new_video=False)
2013 # Retrieve video webpage to extract further information
2014 request = urllib2.Request(url)
2016 self.report_download_webpage(video_id)
2017 webpage = urllib2.urlopen(request).read()
2018 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2019 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2022 # Extract uploader and title from webpage
2023 self.report_extraction(video_id)
2024 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2026 self._downloader.trouble(u'ERROR: unable to extract video title')
2028 video_title = mobj.group(1).decode('utf-8')
2029 simple_title = _simplify_title(video_title)
2031 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2033 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# uploader name in group(2) -- looks off; confirm against a live page.
2035 video_uploader = mobj.group(1).decode('utf-8')
2037 # Extract video thumbnail
2038 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2040 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2042 video_thumbnail = mobj.group(1).decode('utf-8')
2044 # Extract video description
2045 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2047 self._downloader.trouble(u'ERROR: unable to extract video description')
2049 video_description = mobj.group(1).decode('utf-8')
2050 if not video_description:
2051 video_description = 'No description available.'
2053 # Extract video height and width
2054 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2056 self._downloader.trouble(u'ERROR: unable to extract video height')
2058 yv_video_height = mobj.group(1)
2060 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2062 self._downloader.trouble(u'ERROR: unable to extract video width')
2064 yv_video_width = mobj.group(1)
2066 # Retrieve video playlist to extract media URL
2067 # I'm not completely sure what all these options are, but we
2068 # seem to need most of them, otherwise the server sends a 401.
2069 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2070 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2071 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2072 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2073 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2075 self.report_download_webpage(video_id)
2076 webpage = urllib2.urlopen(request).read()
2077 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2078 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2081 # Extract media URL from playlist XML
2082 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2084 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2086 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2087 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2090 # Process video information
2091 self._downloader.process_info({
2092 'id': video_id.decode('utf-8'),
2094 'uploader': video_uploader,
2095 'upload_date': u'NA',
2096 'title': video_title,
2097 'stitle': simple_title,
2098 'ext': video_extension.decode('utf-8'),
2099 'thumbnail': video_thumbnail.decode('utf-8'),
2100 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key -- this plain value silently
# overrides the decoded one two lines up; one of the two should go.
2101 'thumbnail': video_thumbnail,
2104 except UnavailableVideoError:
2105 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this region is a partially elided, line-numbered listing
# (leading integers are original source line numbers; `if mobj is None:`,
# `try:`, `return` and similar lines are missing). Comments document only
# what the visible lines show; the text is not runnable as-is.
#
# VimeoIE: extractor for vimeo.com pages. Downloads the video page, pulls the
# embedded `{config: ...}` JSON out of the HTML, picks the first available
# codec (h264 -> mp4, vp8/vp6 -> flv), and hands a play_redirect URL with the
# request signature/timestamp to the downloader.
2108 class VimeoIE(InfoExtractor):
2109 """Information extractor for vimeo.com."""
2111 # _VALID_URL matches Vimeo URLs
2112 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2115 def __init__(self, downloader=None):
2116 InfoExtractor.__init__(self, downloader)
# Progress reporters: write status lines through the shared FileDownloader.
2118 def report_download_webpage(self, video_id):
2119 """Report webpage download."""
2120 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2122 def report_extraction(self, video_id):
2123 """Report information extraction."""
2124 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction entry point. Visible steps: match the URL, fetch the page
# with the module-level std_headers, parse the config JSON, then collect
# title/uploader/thumbnail/description/upload date and codec/quality.
2126 def _real_extract(self, url, new_video=True):
2127 # Extract ID from URL
2128 mobj = re.match(self._VALID_URL, url)
2130 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2133 # At this point we have a new video
2134 self._downloader.increment_downloads()
2135 video_id = mobj.group(1)
2137 # Retrieve video webpage to extract further information
2138 request = urllib2.Request(url, None, std_headers)
2140 self.report_download_webpage(video_id)
2141 webpage = urllib2.urlopen(request).read()
2142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2143 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2146 # Now we begin extracting as much information as we can from what we
2147 # retrieved. First we extract the information common to all extractors,
2148 # and latter we extract those that are Vimeo specific.
2149 self.report_extraction(video_id)
2151 # Extract the config JSON
# Crude string slicing: takes whatever sits between ' = {config:' and
# ',assets:' in the page source and feeds it to json.loads.
2152 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2154 config = json.loads(config)
2156 self._downloader.trouble(u'ERROR: unable to extract info section')
2160 video_title = config["video"]["title"]
2161 simple_title = _simplify_title(video_title)
2164 video_uploader = config["video"]["owner"]["name"]
2166 # Extract video thumbnail
2167 video_thumbnail = config["video"]["thumbnail"]
2169 # Extract video description
# Default first, then a meta-tag regex; the lxml branch below appears to be
# a fallback path -- presumably taken when the regex fails (guard elided).
2173 video_description = u'No description available.'
2174 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2175 if mobj is not None:
2176 video_description = mobj.group(1)
2178 html_parser = lxml.etree.HTMLParser()
2179 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2180 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2181 # TODO use another parser
2183 # Extract upload date
2184 video_upload_date = u'NA'
2185 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2186 if mobj is not None:
2187 video_upload_date = mobj.group(1)
2189 # Vimeo specific: extract request signature and timestamp
2190 sig = config['request']['signature']
2191 timestamp = config['request']['timestamp']
2193 # Vimeo specific: extract video codec and quality information
2194 # TODO bind to format param
# First codec present in config["video"]["files"] wins; an 'hd' entry under
# that codec selects HD, otherwise SD.
2195 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2196 for codec in codecs:
2197 if codec[0] in config["video"]["files"]:
2198 video_codec = codec[0]
2199 video_extension = codec[1]
2200 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2201 else: quality = 'sd'
2204 self._downloader.trouble(u'ERROR: no known codec found')
# Build the actual media URL from id + signature + timestamp + quality/codec.
2207 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2208 %(video_id, sig, timestamp, quality, video_codec.upper())
2211 # Process video information
2212 self._downloader.process_info({
2215 'uploader': video_uploader,
2216 'upload_date': video_upload_date,
2217 'title': video_title,
2218 'stitle': simple_title,
2219 'ext': video_extension,
2220 'thumbnail': video_thumbnail,
2221 'description': video_description,
2224 except UnavailableVideoError:
2225 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing). Comments
# describe only what the visible lines show.
#
# GenericIE: last-resort extractor for arbitrary pages. Scrapes a media URL
# out of JW Player flashvars (`file=...`) or a generic `file=`/`source=`
# parameter, derives id/extension from the URL's basename, and takes the page
# <title> as the video title and the host name as the uploader.
2228 class GenericIE(InfoExtractor):
2229 """Generic last-resort information extractor."""
2232 IE_NAME = u'generic'
2234 def __init__(self, downloader=None):
2235 InfoExtractor.__init__(self, downloader)
2237 def report_download_webpage(self, video_id):
2238 """Report webpage download."""
# Deliberately loud: warns the user that no site-specific extractor matched.
2239 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2240 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2242 def report_extraction(self, video_id):
2243 """Report information extraction."""
2244 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2246 def _real_extract(self, url):
2247 # At this point we have a new video
2248 self._downloader.increment_downloads()
# Provisional id: last path segment; replaced below once the real media URL
# is known.
2250 video_id = url.split('/')[-1]
2251 request = urllib2.Request(url)
2253 self.report_download_webpage(video_id)
2254 webpage = urllib2.urlopen(request).read()
2255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2256 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2258 except ValueError, err:
2259 # since this is the last-resort InfoExtractor, if
2260 # this error is thrown, it'll be thrown here
2261 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2264 self.report_extraction(video_id)
2265 # Start with something easy: JW Player in SWFObject
2266 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2268 # Broaden the search a little bit
2269 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2271 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2274 # It's possible that one of the regexes
2275 # matched, but returned an empty group:
2276 if mobj.group(1) is None:
2277 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2280 video_url = urllib.unquote(mobj.group(1))
2281 video_id = os.path.basename(video_url)
2283 # here's a fun little line of code for you:
2284 video_extension = os.path.splitext(video_id)[1][1:]
2285 video_id = os.path.splitext(video_id)[0]
2287 # it's tempting to parse this further, but you would
2288 # have to take into account all the variations like
2289 # Video Title - Site Name
2290 # Site Name | Video Title
2291 # Video Title - Tagline | Site Name
2292 # and so on and so forth; it's just not practical
2293 mobj = re.search(r'<title>(.*)</title>', webpage)
2295 self._downloader.trouble(u'ERROR: unable to extract title')
2297 video_title = mobj.group(1).decode('utf-8')
2298 video_title = sanitize_title(video_title)
2299 simple_title = _simplify_title(video_title)
2301 # video uploader is domain name
2302 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this branch is about the
# uploader/domain -- looks like a copy-paste slip in the original.
2304 self._downloader.trouble(u'ERROR: unable to extract title')
2306 video_uploader = mobj.group(1).decode('utf-8')
2309 # Process video information
2310 self._downloader.process_info({
2311 'id': video_id.decode('utf-8'),
2312 'url': video_url.decode('utf-8'),
2313 'uploader': video_uploader,
2314 'upload_date': u'NA',
2315 'title': video_title,
2316 'stitle': simple_title,
2317 'ext': video_extension.decode('utf-8'),
2321 except UnavailableVideoError, err:
2322 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubeSearchIE: handles `ytsearch:`, `ytsearchN:` and `ytsearchall:`
# pseudo-URLs. Queries the YouTube GData API 50 results per page and hands
# each collected video id to the wrapped YouTube extractor.
2326 class YoutubeSearchIE(InfoExtractor):
2327 """Information Extractor for YouTube search queries."""
2328 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2329 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2330 _max_youtube_results = 1000
2331 IE_NAME = u'youtube:search'
2333 def __init__(self, youtube_ie, downloader=None):
2334 InfoExtractor.__init__(self, downloader)
# Actual downloading is delegated to this per-video YouTube extractor.
2335 self._youtube_ie = youtube_ie
2337 def report_download_page(self, query, pagenum):
2338 """Report attempt to download playlist page with given number."""
2339 query = query.decode(preferredencoding())
2340 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2342 def _real_initialize(self):
2343 self._youtube_ie.initialize()
# Parses the prefix: bare prefix -> 1 result, 'all' -> _max_youtube_results,
# numeric -> that many (clamped to the maximum with a warning).
2345 def _real_extract(self, query):
2346 mobj = re.match(self._VALID_URL, query)
2348 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2351 prefix, query = query.split(':')
2353 query = query.encode('utf-8')
2355 self._download_n_results(query, 1)
2357 elif prefix == 'all':
2358 self._download_n_results(query, self._max_youtube_results)
2364 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2366 elif n > self._max_youtube_results:
2367 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2368 n = self._max_youtube_results
2369 self._download_n_results(query, n)
2371 except ValueError: # parsing prefix as integer fails
2372 self._download_n_results(query, 1)
2375 def _download_n_results(self, query, n):
2376 """Downloads a specified number of results for a query"""
# Pages the API (50 ids per request) until `limit` -- the lesser of n and
# the API's totalItems -- is covered, then dispatches each id.
2382 while (50 * pagenum) < limit:
2383 self.report_download_page(query, pagenum+1)
2384 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2385 request = urllib2.Request(result_url)
2387 data = urllib2.urlopen(request).read()
2388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2391 api_response = json.loads(data)['data']
2393 new_ids = list(video['id'] for video in api_response['items'])
2394 video_ids += new_ids
2396 limit = min(n, api_response['totalItems'])
2399 if len(video_ids) > n:
2400 video_ids = video_ids[:n]
2401 for id in video_ids:
2402 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# GoogleSearchIE: handles `gvsearch:`/`gvsearchN:`/`gvsearchall:` queries.
# Unlike the YouTube search extractor it scrapes HTML result pages (10 per
# page), collecting docids until n results are found or there is no
# "next page" link, then delegates each to the Google Video extractor.
2406 class GoogleSearchIE(InfoExtractor):
2407 """Information Extractor for Google Video search queries."""
2408 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2409 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2410 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2411 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2413 _max_google_results = 1000
2414 IE_NAME = u'video.google:search'
2416 def __init__(self, google_ie, downloader=None):
2417 InfoExtractor.__init__(self, downloader)
2418 self._google_ie = google_ie
2420 def report_download_page(self, query, pagenum):
2421 """Report attempt to download playlist page with given number."""
2422 query = query.decode(preferredencoding())
2423 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2425 def _real_initialize(self):
2426 self._google_ie.initialize()
# Prefix parsing mirrors YoutubeSearchIE: bare -> 1, 'all' -> max, numeric ->
# clamped to _max_google_results.
2428 def _real_extract(self, query):
2429 mobj = re.match(self._VALID_URL, query)
2431 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2434 prefix, query = query.split(':')
2436 query = query.encode('utf-8')
2438 self._download_n_results(query, 1)
2440 elif prefix == 'all':
2441 self._download_n_results(query, self._max_google_results)
2447 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2449 elif n > self._max_google_results:
2450 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2451 n = self._max_google_results
2452 self._download_n_results(query, n)
2454 except ValueError: # parsing prefix as integer fails
2455 self._download_n_results(query, 1)
2458 def _download_n_results(self, query, n):
2459 """Downloads a specified number of results for a query"""
2465 self.report_download_page(query, pagenum)
2466 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2467 request = urllib2.Request(result_url)
2469 page = urllib2.urlopen(request).read()
2470 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2471 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2474 # Extract video identifiers
2475 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2476 video_id = mobj.group(1)
2477 if video_id not in video_ids:
2478 video_ids.append(video_id)
2479 if len(video_ids) == n:
2480 # Specified n videos reached
2481 for id in video_ids:
2482 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "next" link on the page means the results are exhausted -- dispatch
# whatever was collected so far.
2485 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2486 for id in video_ids:
2487 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2490 pagenum = pagenum + 1
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YahooSearchIE: handles `yvsearch:`/`yvsearchN:`/`yvsearchall:` queries.
# Scrapes Yahoo! Video HTML result pages; structure parallels GoogleSearchIE
# except that an explicit `already_seen` set deduplicates ids.
2493 class YahooSearchIE(InfoExtractor):
2494 """Information Extractor for Yahoo! Video search queries."""
2495 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2496 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2497 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2498 _MORE_PAGES_INDICATOR = r'\s*Next'
2500 _max_yahoo_results = 1000
2501 IE_NAME = u'video.yahoo:search'
2503 def __init__(self, yahoo_ie, downloader=None):
2504 InfoExtractor.__init__(self, downloader)
2505 self._yahoo_ie = yahoo_ie
2507 def report_download_page(self, query, pagenum):
2508 """Report attempt to download playlist page with given number."""
2509 query = query.decode(preferredencoding())
2510 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2512 def _real_initialize(self):
2513 self._yahoo_ie.initialize()
# Same prefix handling as the other search IEs: bare -> 1, 'all' -> max,
# numeric -> clamped to _max_yahoo_results.
2515 def _real_extract(self, query):
2516 mobj = re.match(self._VALID_URL, query)
2518 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2521 prefix, query = query.split(':')
2523 query = query.encode('utf-8')
2525 self._download_n_results(query, 1)
2527 elif prefix == 'all':
2528 self._download_n_results(query, self._max_yahoo_results)
2534 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2536 elif n > self._max_yahoo_results:
2537 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2538 n = self._max_yahoo_results
2539 self._download_n_results(query, n)
2541 except ValueError: # parsing prefix as integer fails
2542 self._download_n_results(query, 1)
2545 def _download_n_results(self, query, n):
2546 """Downloads a specified number of results for a query"""
2549 already_seen = set()
2553 self.report_download_page(query, pagenum)
2554 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2555 request = urllib2.Request(result_url)
2557 page = urllib2.urlopen(request).read()
2558 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2559 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2562 # Extract video identifiers
2563 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2564 video_id = mobj.group(1)
2565 if video_id not in already_seen:
2566 video_ids.append(video_id)
2567 already_seen.add(video_id)
2568 if len(video_ids) == n:
2569 # Specified n videos reached
2570 for id in video_ids:
2571 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link -> results exhausted; dispatch what we have.
2574 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2575 for id in video_ids:
2576 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2579 pagenum = pagenum + 1
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubePlaylistIE: handles playlist/artist/course/user URLs. Pages through
# the playlist HTML, collects watch?v= ids, applies the downloader's
# playliststart/playlistend window, and delegates each id to the wrapped
# YouTube extractor.
2582 class YoutubePlaylistIE(InfoExtractor):
2583 """Information Extractor for YouTube playlists."""
2585 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2586 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2587 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2588 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2590 IE_NAME = u'youtube:playlist'
2592 def __init__(self, youtube_ie, downloader=None):
2593 InfoExtractor.__init__(self, downloader)
2594 self._youtube_ie = youtube_ie
2596 def report_download_page(self, playlist_id, pagenum):
2597 """Report attempt to download playlist page with given number."""
2598 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2600 def _real_initialize(self):
2601 self._youtube_ie.initialize()
2603 def _real_extract(self, url):
2604 # Extract playlist id
2605 mobj = re.match(self._VALID_URL, url)
2607 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 matching means the URL pointed at a single video inside the
# playlist -- delegate that one video directly.
2611 if mobj.group(3) is not None:
2612 self._youtube_ie.extract(mobj.group(3))
2615 # Download playlist pages
2616 # prefix is 'p' as default for playlists but there are other types that need extra care
2617 playlist_prefix = mobj.group(1)
2618 if playlist_prefix == 'a':
2619 playlist_access = 'artist'
2621 playlist_prefix = 'p'
2622 playlist_access = 'view_play_list'
2623 playlist_id = mobj.group(2)
2628 self.report_download_page(playlist_id, pagenum)
2629 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2630 request = urllib2.Request(url)
2632 page = urllib2.urlopen(request).read()
2633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2634 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2637 # Extract video identifiers
2639 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2640 if mobj.group(1) not in ids_in_page:
2641 ids_in_page.append(mobj.group(1))
2642 video_ids.extend(ids_in_page)
2644 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2646 pagenum = pagenum + 1
# Window the collected ids by the user's --playlist-start/--playlist-end
# options (playliststart is 1-based in params, hence the -1).
2648 playliststart = self._downloader.params.get('playliststart', 1) - 1
2649 playlistend = self._downloader.params.get('playlistend', -1)
2650 if playlistend == -1:
2651 video_ids = video_ids[playliststart:]
2653 video_ids = video_ids[playliststart:playlistend]
2655 for id in video_ids:
2656 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# YoutubeUserIE: handles youtube.com/user/NAME and ytuser:NAME. Pages the
# GData uploads feed in chunks of _GDATA_PAGE_SIZE, stops early when a page
# comes back less than full, applies playliststart/playlistend, and delegates
# each id to the wrapped YouTube extractor.
2660 class YoutubeUserIE(InfoExtractor):
2661 """Information Extractor for YouTube users."""
2663 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2664 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2665 _GDATA_PAGE_SIZE = 50
2666 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2667 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2669 IE_NAME = u'youtube:user'
2671 def __init__(self, youtube_ie, downloader=None):
2672 InfoExtractor.__init__(self, downloader)
2673 self._youtube_ie = youtube_ie
2675 def report_download_page(self, username, start_index):
2676 """Report attempt to download user page."""
2677 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2678 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2680 def _real_initialize(self):
2681 self._youtube_ie.initialize()
2683 def _real_extract(self, url):
2685 mobj = re.match(self._VALID_URL, url)
2687 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2690 username = mobj.group(1)
2692 # Download video ids using YouTube Data API. Result size per
2693 # query is limited (currently to 50 videos) so we need to query
2694 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2701 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2702 self.report_download_page(username, start_index)
2704 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2707 page = urllib2.urlopen(request).read()
2708 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2709 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2712 # Extract video identifiers
2715 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2716 if mobj.group(1) not in ids_in_page:
2717 ids_in_page.append(mobj.group(1))
2719 video_ids.extend(ids_in_page)
2721 # A little optimization - if current page is not
2722 # "full", ie. does not contain PAGE_SIZE video ids then
2723 # we can assume that this page is the last one - there
2724 # are no more ids on further pages - no need to query
2727 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Same 1-based playliststart / playlistend windowing as YoutubePlaylistIE.
2732 all_ids_count = len(video_ids)
2733 playliststart = self._downloader.params.get('playliststart', 1) - 1
2734 playlistend = self._downloader.params.get('playlistend', -1)
2736 if playlistend == -1:
2737 video_ids = video_ids[playliststart:]
2739 video_ids = video_ids[playliststart:playlistend]
2741 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2742 (username, all_ids_count, len(video_ids)))
2744 for video_id in video_ids:
2745 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# DepositFilesIE: extractor for depositfiles.com file pages. Rebuilds the URL
# in the English locale, simulates pressing the "Free download" button by
# POSTing gateway_result=1, then scrapes the real fileshare URL out of the
# response (or surfaces the site's restriction message on failure).
2748 class DepositFilesIE(InfoExtractor):
2749 """Information extractor for depositfiles.com"""
2751 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2752 IE_NAME = u'DepositFiles'
2754 def __init__(self, downloader=None):
2755 InfoExtractor.__init__(self, downloader)
2757 def report_download_webpage(self, file_id):
2758 """Report webpage download."""
2759 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2761 def report_extraction(self, file_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2765 def _real_extract(self, url):
2766 # At this point we have a new file
2767 self._downloader.increment_downloads()
2769 file_id = url.split('/')[-1]
2770 # Rebuild url in english locale
2771 url = 'http://depositfiles.com/en/files/' + file_id
2773 # Retrieve file webpage with 'Free download' button pressed
# Passing urlencoded data to urllib2.Request makes this a POST.
2774 free_download_indication = { 'gateway_result' : '1' }
2775 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2777 self.report_download_webpage(file_id)
2778 webpage = urllib2.urlopen(request).read()
2779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2780 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2783 # Search for the real file URL
2784 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2785 if (mobj is None) or (mobj.group(1) is None):
2786 # Try to figure out reason of the error.
2787 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2788 if (mobj is not None) and (mobj.group(1) is not None):
2789 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2790 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2792 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2795 file_url = mobj.group(1)
2796 file_extension = os.path.splitext(file_url)[1][1:]
2798 # Search for file title
2799 mobj = re.search(r'<b title="(.*?)">', webpage)
2801 self._downloader.trouble(u'ERROR: unable to extract title')
2803 file_title = mobj.group(1).decode('utf-8')
2806 # Process file information
2807 self._downloader.process_info({
2808 'id': file_id.decode('utf-8'),
2809 'url': file_url.decode('utf-8'),
2811 'upload_date': u'NA',
2812 'title': file_title,
2813 'stitle': file_title,
2814 'ext': file_extension.decode('utf-8'),
2818 except UnavailableVideoError, err:
2819 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): partially elided, line-numbered listing (leading integers are
# original source line numbers; guard/try/return lines are missing).
#
# FacebookIE: extractor for Facebook videos. Logs in during initialization
# (credentials from CLI options or .netrc under machine 'facebook'), scrapes
# video metadata and per-format URLs out of JavaScript segments in the video
# page, then selects formats the same way the YouTube extractor does.
2822 class FacebookIE(InfoExtractor):
2823 """Information Extractor for Facebook"""
2825 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2826 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2827 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for format selection below.
2828 _available_formats = ['video', 'highqual', 'lowqual']
2829 _video_extensions = {
2834 IE_NAME = u'facebook'
2836 def __init__(self, downloader=None):
2837 InfoExtractor.__init__(self, downloader)
2839 def _reporter(self, message):
2840 """Add header and report message."""
2841 self._downloader.to_screen(u'[facebook] %s' % message)
2843 def report_login(self):
2844 """Report attempt to log in."""
2845 self._reporter(u'Logging in')
2847 def report_video_webpage_download(self, video_id):
2848 """Report attempt to download video webpage."""
2849 self._reporter(u'%s: Downloading video webpage' % video_id)
2851 def report_information_extraction(self, video_id):
2852 """Report attempt to extract video information."""
2853 self._reporter(u'%s: Extracting video information' % video_id)
# Scrapes metadata and per-format media URLs from JS calls embedded in the
# page; values are unicode-escaped inside the (utf-8) page, hence the
# unquote_plus + decode("unicode_escape") dance.
2855 def _parse_page(self, video_webpage):
2856 """Extract video information from page"""
2858 data = {'title': r'\("video_title", "(.*?)"\)',
2859 'description': r'<div class="datawrap">(.*?)</div>',
2860 'owner': r'\("video_owner_name", "(.*?)"\)',
2861 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2864 for piece in data.keys():
2865 mobj = re.search(data[piece], video_webpage)
2866 if mobj is not None:
2867 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2871 for fmt in self._available_formats:
2872 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2873 if mobj is not None:
2874 # URL is in a Javascript segment inside an escaped Unicode format within
2875 # the generally utf-8 page
2876 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2877 video_info['video_urls'] = video_urls
# Login happens once, at initialization: credentials come from the
# downloader's username/password params, or from .netrc when --netrc is set.
2881 def _real_initialize(self):
2882 if self._downloader is None:
2887 downloader_params = self._downloader.params
2889 # Attempt to use provided username and password or .netrc data
2890 if downloader_params.get('username', None) is not None:
2891 useremail = downloader_params['username']
2892 password = downloader_params['password']
2893 elif downloader_params.get('usenetrc', False):
2895 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2896 if info is not None:
2900 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2901 except (IOError, netrc.NetrcParseError), err:
2902 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2905 if useremail is None:
2914 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2917 login_results = urllib2.urlopen(request).read()
# A login form in the response means the login failed (bad credentials or
# rate limit); failure is a warning, not fatal.
2918 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2919 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2922 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2925 def _real_extract(self, url):
2926 mobj = re.match(self._VALID_URL, url)
2928 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2930 video_id = mobj.group('ID')
2933 self.report_video_webpage_download(video_id)
2934 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2936 page = urllib2.urlopen(request)
2937 video_webpage = page.read()
2938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2939 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2942 # Start extracting information
2943 self.report_information_extraction(video_id)
2945 # Extract information
2946 video_info = self._parse_page(video_webpage)
2949 if 'owner' not in video_info:
2950 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2952 video_uploader = video_info['owner']
2955 if 'title' not in video_info:
2956 self._downloader.trouble(u'ERROR: unable to extract video title')
2958 video_title = video_info['title']
2959 video_title = video_title.decode('utf-8')
2960 video_title = sanitize_title(video_title)
2962 simple_title = _simplify_title(video_title)
# Missing thumbnail is non-fatal: warn and fall back to the empty string.
2965 if 'thumbnail' not in video_info:
2966 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2967 video_thumbnail = ''
2969 video_thumbnail = video_info['thumbnail']
2973 if 'upload_date' in video_info:
2974 upload_time = video_info['upload_date']
2975 timetuple = email.utils.parsedate_tz(upload_time)
2976 if timetuple is not None:
2978 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2983 video_description = video_info.get('description', 'No description available.')
# Format selection mirrors the YouTube extractor: optionally limit the
# candidate list with format_limit, then honor -f (None -> best, 'worst' ->
# last, '-1' -> all formats, otherwise the named format).
2985 url_map = video_info['video_urls']
2986 if len(url_map.keys()) > 0:
2987 # Decide which formats to download
2988 req_format = self._downloader.params.get('format', None)
2989 format_limit = self._downloader.params.get('format_limit', None)
2991 if format_limit is not None and format_limit in self._available_formats:
2992 format_list = self._available_formats[self._available_formats.index(format_limit):]
2994 format_list = self._available_formats
2995 existing_formats = [x for x in format_list if x in url_map]
2996 if len(existing_formats) == 0:
2997 self._downloader.trouble(u'ERROR: no known formats available for video')
2999 if req_format is None:
3000 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3001 elif req_format == 'worst':
3002 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3003 elif req_format == '-1':
3004 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3007 if req_format not in url_map:
3008 self._downloader.trouble(u'ERROR: requested format not available')
3010 video_url_list = [(req_format, url_map[req_format])] # Specific format
3012 for format_param, video_real_url in video_url_list:
3014 # At this point we have a new video
3015 self._downloader.increment_downloads()
3018 video_extension = self._video_extensions.get(format_param, 'mp4')
3021 # Process video information
3022 self._downloader.process_info({
3023 'id': video_id.decode('utf-8'),
3024 'url': video_real_url.decode('utf-8'),
3025 'uploader': video_uploader.decode('utf-8'),
3026 'upload_date': upload_date,
3027 'title': video_title,
3028 'stitle': simple_title,
3029 'ext': video_extension.decode('utf-8'),
3030 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3031 'thumbnail': video_thumbnail.decode('utf-8'),
3032 'description': video_description.decode('utf-8'),
3035 except UnavailableVideoError, err:
3036 self._downloader.trouble(u'\nERROR: unable to download video')
3038 class BlipTVIE(InfoExtractor):
3039 """Information extractor for blip.tv"""
# NOTE(review): this excerpt is elided (original line numbers jump) and
# indentation was stripped; guard lines such as `if mobj is None:` / `try:`
# are missing from view — confirm against the full file.
3041 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3042 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3043 IE_NAME = u'blip.tv'
3045 def report_extraction(self, file_id):
3046 """Report information extraction."""
3047 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3049 def report_direct_download(self, title):
3050 """Report information extraction."""
3051 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3053 def _real_extract(self, url):
# Fetch the page as JSON (skin=json); if the server answers with a video/*
# Content-Type instead, treat it as a direct media download.
3054 mobj = re.match(self._VALID_URL, url)
3056 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3063 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3064 request = urllib2.Request(json_url)
3065 self.report_extraction(mobj.group(1))
3068 urlh = urllib2.urlopen(request)
3069 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3070 basename = url.split('/')[-1]
3071 title,ext = os.path.splitext(basename)
3072 title = title.decode('UTF-8')
3073 ext = ext.replace('.', '')
3074 self.report_direct_download(title)
3079 'stitle': _simplify_title(title),
3083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3084 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3086 if info is None: # Regular URL
# Parse the JSON payload; the useful record lives under 'Post' when present.
3088 json_code = urlh.read()
3089 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3090 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3094 json_data = json.loads(json_code)
3095 if 'Post' in json_data:
3096 data = json_data['Post']
# blip.tv timestamps look like '08-25-11 12:30PM'; normalized to YYYYMMDD.
3100 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3101 video_url = data['media']['url']
# The file extension is taken from the media URL itself.
3102 umobj = re.match(self._URL_EXT, video_url)
3104 raise ValueError('Can not determine filename extension')
3105 ext = umobj.group(1)
3108 'id': data['item_id'],
3110 'uploader': data['display_name'],
3111 'upload_date': upload_date,
3112 'title': data['title'],
3113 'stitle': _simplify_title(data['title']),
3115 'format': data['media']['mimeType'],
3116 'thumbnail': data['thumbnailUrl'],
3117 'description': data['description'],
3118 'player_url': data['embedUrl']
3120 except (ValueError,KeyError), err:
3121 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3124 self._downloader.increment_downloads()
3127 self._downloader.process_info(info)
3128 except UnavailableVideoError, err:
3129 self._downloader.trouble(u'\nERROR: unable to download video')
3132 class MyVideoIE(InfoExtractor):
3133 """Information Extractor for myvideo.de."""
3135 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3136 IE_NAME = u'myvideo'
3138 def __init__(self, downloader=None):
3139 InfoExtractor.__init__(self, downloader)
3141 def report_download_webpage(self, video_id):
3142 """Report webpage download."""
3143 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3145 def report_extraction(self, video_id):
3146 """Report information extraction."""
3147 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3149 def _real_extract(self,url):
3150 mobj = re.match(self._VALID_URL, url)
3152 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3155 video_id = mobj.group(1)
3158 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3160 self.report_download_webpage(video_id)
3161 webpage = urllib2.urlopen(request).read()
3162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3163 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3166 self.report_extraction(video_id)
3167 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3170 self._downloader.trouble(u'ERROR: unable to extract media URL')
3172 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3174 mobj = re.search('<title>([^<]+)</title>', webpage)
3176 self._downloader.trouble(u'ERROR: unable to extract title')
3179 video_title = mobj.group(1)
3180 video_title = sanitize_title(video_title)
3182 simple_title = _simplify_title(video_title)
3185 self._downloader.process_info({
3189 'upload_date': u'NA',
3190 'title': video_title,
3191 'stitle': simple_title,
3196 except UnavailableVideoError:
3197 self._downloader.trouble(u'\nERROR: Unable to download video')
3199 class ComedyCentralIE(InfoExtractor):
3200 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; several guard/`try:` lines are missing from this view.
# Accepts shortcut forms ':tds', ':thedailyshow', ':cr', ':colbert', etc.,
# as well as full-episode URLs on thedailyshow.com / colbertnation.com.
3202 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3203 IE_NAME = u'comedycentral'
3205 def report_extraction(self, episode_id):
3206 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3208 def report_config_download(self, episode_id):
3209 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3211 def report_index_download(self, episode_id):
3212 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3214 def report_player_url(self, episode_id):
3215 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3217 def _real_extract(self, url):
3218 mobj = re.match(self._VALID_URL, url)
3220 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut names are rewritten to the show's full-episodes landing page,
# which then redirects to the newest episode.
3223 if mobj.group('shortname'):
3224 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3225 url = u'http://www.thedailyshow.com/full-episodes/'
3227 url = u'http://www.colbertnation.com/full-episodes/'
3228 mobj = re.match(self._VALID_URL, url)
3229 assert mobj is not None
3231 dlNewest = not mobj.group('episode')
3233 epTitle = mobj.group('showname')
3235 epTitle = mobj.group('episode')
3237 req = urllib2.Request(url)
3238 self.report_extraction(epTitle)
3240 htmlHandle = urllib2.urlopen(req)
3241 html = htmlHandle.read()
3242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3243 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After following redirects, re-match so the final URL names a concrete episode.
3246 url = htmlHandle.geturl()
3247 mobj = re.match(self._VALID_URL, url)
3249 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3251 if mobj.group('episode') == '':
3252 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3254 epTitle = mobj.group('episode')
# The Flash player URL embedded in the page carries the mtvnservices media URI.
3256 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3257 if len(mMovieParams) == 0:
3258 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3261 playerUrl_raw = mMovieParams[0][0]
3262 self.report_player_url(epTitle)
3264 urlHandle = urllib2.urlopen(playerUrl_raw)
3265 playerUrl = urlHandle.geturl()
3266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3267 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The MRSS feed lists one <item> per media segment of the episode.
3270 uri = mMovieParams[0][1]
3271 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3272 self.report_index_download(epTitle)
3274 indexXml = urllib2.urlopen(indexUrl).read()
3275 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3276 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3279 idoc = xml.etree.ElementTree.fromstring(indexXml)
3280 itemEls = idoc.findall('.//item')
3281 for itemEl in itemEls:
3282 mediaId = itemEl.findall('./guid')[0].text
3283 shortMediaId = mediaId.split(':')[-1]
3284 showId = mediaId.split(':')[-2].replace('.com', '')
3285 officialTitle = itemEl.findall('./title')[0].text
3286 officialDate = itemEl.findall('./pubDate')[0].text
# A second (mediaGen) XML request yields the per-bitrate rendition URLs.
3288 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3289 urllib.urlencode({'uri': mediaId}))
3290 configReq = urllib2.Request(configUrl)
3291 self.report_config_download(epTitle)
3293 configXml = urllib2.urlopen(configReq).read()
3294 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3295 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3298 cdoc = xml.etree.ElementTree.fromstring(configXml)
3300 for rendition in cdoc.findall('.//rendition'):
3301 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3305 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3308 # For now, just pick the highest bitrate
3309 format,video_url = turls[-1]
3311 self._downloader.increment_downloads()
3313 effTitle = showId + u'-' + epTitle
3318 'upload_date': officialDate,
3320 'stitle': _simplify_title(effTitle),
3324 'description': officialTitle,
3325 'player_url': playerUrl
3329 self._downloader.process_info(info)
3330 except UnavailableVideoError, err:
3331 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3335 class EscapistIE(InfoExtractor):
3336 """Information extractor for The Escapist """
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; `try:`/guard lines are missing from this view.
3338 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3339 IE_NAME = u'escapist'
3341 def report_extraction(self, showName):
3342 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3344 def report_config_download(self, showName):
3345 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3347 def _real_extract(self, url):
3348 htmlParser = HTMLParser.HTMLParser()
3350 mobj = re.match(self._VALID_URL, url)
3352 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3354 showName = mobj.group('showname')
3355 videoId = mobj.group('episode')
3357 self.report_extraction(showName)
3359 webPage = urllib2.urlopen(url).read()
3360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3361 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Description, thumbnail and player URL come from <meta> tags; the player URL
# carries a 'config=' query parameter pointing at the JSON configuration.
3364 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3365 description = htmlParser.unescape(descMatch.group(1))
3366 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3367 imgUrl = htmlParser.unescape(imgMatch.group(1))
3368 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3369 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3370 configUrlMatch = re.search('config=(.*)$', playerUrl)
3371 configUrl = urllib2.unquote(configUrlMatch.group(1))
3373 self.report_config_download(showName)
3375 configJSON = urllib2.urlopen(configUrl).read()
3376 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3377 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3380 # Technically, it's JavaScript, not JSON
# Crude normalization: single quotes swapped for double quotes so json.loads
# accepts it. Breaks if the payload contains apostrophes inside strings.
3381 configJSON = configJSON.replace("'", '"')
3384 config = json.loads(configJSON)
3385 except (ValueError,), err:
3386 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL.
3389 playlist = config['playlist']
3390 videoUrl = playlist[1]['url']
3392 self._downloader.increment_downloads()
3396 'uploader': showName,
3397 'upload_date': None,
3399 'stitle': _simplify_title(showName),
3402 'thumbnail': imgUrl,
3403 'description': description,
3404 'player_url': playerUrl,
3408 self._downloader.process_info(info)
3409 except UnavailableVideoError, err:
3410 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3413 class CollegeHumorIE(InfoExtractor):
3414 """Information extractor for collegehumor.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3416 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3417 IE_NAME = u'collegehumor'
3419 def report_webpage(self, video_id):
3420 """Report information extraction."""
3421 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3423 def report_extraction(self, video_id):
3424 """Report information extraction."""
3425 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3427 def _real_extract(self, url):
3428 htmlParser = HTMLParser.HTMLParser()
3430 mobj = re.match(self._VALID_URL, url)
3432 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3434 video_id = mobj.group('videoid')
3436 self.report_webpage(video_id)
3437 request = urllib2.Request(url)
3439 webpage = urllib2.urlopen(request).read()
3440 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3441 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ('video:NNN') distinct from the URL's id;
# the moogaloop XML endpoint is keyed on that internal id.
3444 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3446 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3448 internal_video_id = m.group('internalvideoid')
3452 'internal_id': internal_video_id,
3455 self.report_extraction(video_id)
3456 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3458 metaXml = urllib2.urlopen(xmlUrl).read()
3459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3460 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Fill in title/description/url/thumbnail from the <video> node of the XML.
3463 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3465 videoNode = mdoc.findall('./video')[0]
3466 info['description'] = videoNode.findall('./description')[0].text
3467 info['title'] = videoNode.findall('./caption')[0].text
3468 info['stitle'] = _simplify_title(info['title'])
3469 info['url'] = videoNode.findall('./file')[0].text
3470 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
3471 info['ext'] = info['url'].rpartition('.')[2]
3472 info['format'] = info['ext']
3474 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3477 self._downloader.increment_downloads()
3480 self._downloader.process_info(info)
3481 except UnavailableVideoError, err:
3482 self._downloader.trouble(u'\nERROR: unable to download video')
3485 class XVideosIE(InfoExtractor):
3486 """Information extractor for xvideos.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3488 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3489 IE_NAME = u'xvideos'
3491 def report_webpage(self, video_id):
3492 """Report information extraction."""
3493 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3495 def report_extraction(self, video_id):
3496 """Report information extraction."""
3497 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3499 def _real_extract(self, url):
3500 htmlParser = HTMLParser.HTMLParser()
3502 mobj = re.match(self._VALID_URL, url)
3504 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3506 video_id = mobj.group(1).decode('utf-8')
3508 self.report_webpage(video_id)
3510 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3512 webpage = urllib2.urlopen(request).read()
3513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3514 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3517 self.report_extraction(video_id)
# The FLV URL is passed percent-encoded in the 'flv_url' query parameter.
3521 mobj = re.search(r'flv_url=(.+?)&', webpage)
3523 self._downloader.trouble(u'ERROR: unable to extract video url')
3525 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is taken from <title>, dropping the trailing ' - XVID...' suffix.
3529 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3531 self._downloader.trouble(u'ERROR: unable to extract video title')
3533 video_title = mobj.group(1).decode('utf-8')
3536 # Extract video thumbnail
3537 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3539 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3541 video_thumbnail = mobj.group(1).decode('utf-8')
3545 self._downloader.increment_downloads()
3550 'upload_date': None,
3551 'title': video_title,
3552 'stitle': _simplify_title(video_title),
3555 'thumbnail': video_thumbnail,
3556 'description': None,
3561 self._downloader.process_info(info)
3562 except UnavailableVideoError, err:
3563 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3566 class SoundcloudIE(InfoExtractor):
3567 """Information extractor for soundcloud.com
3568 To access the media, the uid of the song and a stream token
3569 must be extracted from the page source and the script must make
3570 a request to media.soundcloud.com/crossdomain.xml. Then
3571 the media can be grabbed by requesting from an url composed
3572 of the stream token and uid
3575 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3576 IE_NAME = u'soundcloud'
3578 def __init__(self, downloader=None):
3579 InfoExtractor.__init__(self, downloader)
3581 def report_webpage(self, video_id):
3582 """Report information extraction."""
3583 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3585 def report_extraction(self, video_id):
3586 """Report information extraction."""
3587 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3589 def _real_extract(self, url):
3590 htmlParser = HTMLParser.HTMLParser()
3592 mobj = re.match(self._VALID_URL, url)
3594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3597 # extract uploader (which is in the url)
3598 uploader = mobj.group(1).decode('utf-8')
3599 # extract simple title (uploader + slug of song title)
3600 slug_title = mobj.group(2).decode('utf-8')
3601 simple_title = uploader + '-' + slug_title
3603 self.report_webpage('%s/%s' % (uploader, slug_title))
3605 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3607 webpage = urllib2.urlopen(request).read()
3608 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3609 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3612 self.report_extraction('%s/%s' % (uploader, slug_title))
3614 # extract uid and stream token that soundcloud hands out for access
3615 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3617 video_id = mobj.group(1)
3618 stream_token = mobj.group(2)
3620 # extract unsimplified title
3621 mobj = re.search('"title":"(.*?)",', webpage)
3623 title = mobj.group(1)
3625 # construct media url (with uid/token)
3626 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3627 mediaURL = mediaURL % (video_id, stream_token)
3630 description = u'No description available'
3631 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3633 description = mobj.group(1)
3637 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3640 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3641 except Exception, e:
3644 # for soundcloud, a request to a cross domain is required for cookies
3645 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3648 self._downloader.process_info({
3649 'id': video_id.decode('utf-8'),
3651 'uploader': uploader.decode('utf-8'),
3652 'upload_date': upload_date,
3653 'title': simple_title.decode('utf-8'),
3654 'stitle': simple_title.decode('utf-8'),
3658 'description': description.decode('utf-8')
3660 except UnavailableVideoError:
3661 self._downloader.trouble(u'\nERROR: unable to download video')
3664 class InfoQIE(InfoExtractor):
3665 """Information extractor for infoq.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:` lines are missing from this view.
3667 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3670 def report_webpage(self, video_id):
3671 """Report information extraction."""
3672 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3674 def report_extraction(self, video_id):
3675 """Report information extraction."""
3676 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3678 def _real_extract(self, url):
3679 htmlParser = HTMLParser.HTMLParser()
3681 mobj = re.match(self._VALID_URL, url)
3683 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3686 self.report_webpage(url)
3688 request = urllib2.Request(url)
3690 webpage = urllib2.urlopen(request).read()
3691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3692 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3695 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
3699 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3701 self._downloader.trouble(u'ERROR: unable to extract video url')
3703 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3707 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3709 self._downloader.trouble(u'ERROR: unable to extract video title')
3711 video_title = mobj.group(1).decode('utf-8')
3713 # Extract description
3714 video_description = u'No description available.'
3715 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3716 if mobj is not None:
3717 video_description = mobj.group(1).decode('utf-8')
# id and extension are derived from the last path component of the RTMP URL.
3719 video_filename = video_url.split('/')[-1]
3720 video_id, extension = video_filename.split('.')
3722 self._downloader.increment_downloads()
3727 'upload_date': None,
3728 'title': video_title,
3729 'stitle': _simplify_title(video_title),
3731 'format': extension, # Extension is always(?) mp4, but seems to be flv
3733 'description': video_description,
3738 self._downloader.process_info(info)
3739 except UnavailableVideoError, err:
3740 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3742 class MixcloudIE(InfoExtractor):
3743 """Information extractor for www.mixcloud.com"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped; guard/`try:`/`return` lines are missing from this view.
3744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3745 IE_NAME = u'mixcloud'
3747 def __init__(self, downloader=None):
3748 InfoExtractor.__init__(self, downloader)
3750 def report_download_json(self, file_id):
3751 """Report JSON download."""
3752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3754 def report_extraction(self, file_id):
3755 """Report information extraction."""
3756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3758 def get_urls(self, jsonData, fmt, bitrate='best'):
3759 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} dict or a plain url list;
# the TypeError branch handles the no-bitrate-info case.
3762 bitrate_list = jsonData[fmt]
3763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3764 bitrate = max(bitrate_list) # select highest
3766 url_list = jsonData[fmt][bitrate]
3767 except TypeError: # we have no bitrate info.
3768 url_list = jsonData[fmt]
3771 def check_urls(self, url_list):
3772 """Returns 1st active url from list"""
# Probes each candidate with a GET; network failures fall through to the next.
3773 for url in url_list:
3775 urllib2.urlopen(url)
3777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3782 def _print_formats(self, formats):
3783 print 'Available formats:'
3784 for fmt in formats.keys():
3785 for b in formats[fmt]:
3787 ext = formats[fmt][b][0]
3788 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3789 except TypeError: # we have no bitrate info
3790 ext = formats[fmt][0]
3791 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3794 def _real_extract(self, url):
3795 mobj = re.match(self._VALID_URL, url)
3797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3799 # extract uploader & filename from url
3800 uploader = mobj.group(1).decode('utf-8')
3801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3803 # construct API request
# API path reuses the last two URL segments: /api/1/cloudcast/<user>/<slug>.json
3804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3805 # retrieve .json file with links to files
3806 request = urllib2.Request(file_url)
3808 self.report_download_json(file_url)
3809 jsonData = urllib2.urlopen(request).read()
3810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3815 json_data = json.loads(jsonData)
3816 player_url = json_data['player_swf_url']
3817 formats = dict(json_data['audio_formats'])
3819 req_format = self._downloader.params.get('format', None)
3822 if self._downloader.params.get('listformats', None):
3823 self._print_formats(formats)
# Default/'best': first format whose candidate URLs include a live one.
3826 if req_format is None or req_format == 'best':
3827 for format_param in formats.keys():
3828 url_list = self.get_urls(formats, format_param)
3830 file_url = self.check_urls(url_list)
3831 if file_url is not None:
3834 if req_format not in formats.keys():
3835 self._downloader.trouble(u'ERROR: format is not available')
3838 url_list = self.get_urls(formats, req_format)
3839 file_url = self.check_urls(url_list)
3840 format_param = req_format
3843 self._downloader.increment_downloads()
3845 # Process file information
3846 self._downloader.process_info({
3847 'id': file_id.decode('utf-8'),
3848 'url': file_url.decode('utf-8'),
3849 'uploader': uploader.decode('utf-8'),
3850 'upload_date': u'NA',
3851 'title': json_data['name'],
3852 'stitle': _simplify_title(json_data['name']),
3853 'ext': file_url.split('.')[-1].decode('utf-8'),
3854 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3855 'thumbnail': json_data['thumbnail_url'],
3856 'description': json_data['description'],
3857 'player_url': player_url.decode('utf-8'),
3859 except UnavailableVideoError, err:
3860 self._downloader.trouble(u'ERROR: unable to download file')
3862 class StanfordOpenClassroomIE(InfoExtractor):
3863 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): excerpt is elided (original line numbers jump) and indentation
# was stripped. Three cases: a specific video, a course page (recurses into
# its VideoPages), or the root page (recurses into all CoursePages).
3865 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3866 IE_NAME = u'stanfordoc'
3868 def report_download_webpage(self, objid):
3869 """Report information extraction."""
3870 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3872 def report_extraction(self, video_id):
3873 """Report information extraction."""
3874 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3876 def _real_extract(self, url):
3877 mobj = re.match(self._VALID_URL, url)
3879 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3882 if mobj.group('course') and mobj.group('video'): # A specific video
3883 course = mobj.group('course')
3884 video = mobj.group('video')
3886 'id': _simplify_title(course + '_' + video),
3889 self.report_extraction(info['id'])
# Video metadata lives in a per-video XML file next to the course's videos.
3890 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3891 xmlUrl = baseUrl + video + '.xml'
3893 metaXml = urllib2.urlopen(xmlUrl).read()
3894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3895 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3897 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3899 info['title'] = mdoc.findall('./title')[0].text
3900 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3902 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3904 info['stitle'] = _simplify_title(info['title'])
3905 info['ext'] = info['url'].rpartition('.')[2]
3906 info['format'] = info['ext']
3907 self._downloader.increment_downloads()
3909 self._downloader.process_info(info)
3910 except UnavailableVideoError, err:
3911 self._downloader.trouble(u'\nERROR: unable to download video')
3912 elif mobj.group('course'): # A course page
3913 unescapeHTML = HTMLParser.HTMLParser().unescape
3915 course = mobj.group('course')
3917 'id': _simplify_title(course),
3921 self.report_download_webpage(info['id'])
3923 coursepage = urllib2.urlopen(url).read()
3924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3925 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3928 m = re.search('<h1>([^<]+)</h1>', coursepage)
3930 info['title'] = unescapeHTML(m.group(1))
3932 info['title'] = info['id']
3933 info['stitle'] = _simplify_title(info['title'])
3935 m = re.search('<description>([^<]+)</description>', coursepage)
3937 info['description'] = unescapeHTML(m.group(1))
# Each linked VideoPage becomes a 'reference' entry, extracted recursively below.
3939 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3942 'type': 'reference',
3943 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3947 for entry in info['list']:
3948 assert entry['type'] == 'reference'
3949 self.extract(entry['url'])
3951 unescapeHTML = HTMLParser.HTMLParser().unescape
3954 'id': 'Stanford OpenClassroom',
3958 self.report_download_webpage(info['id'])
3959 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3961 rootpage = urllib2.urlopen(rootURL).read()
3962 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3963 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3966 info['title'] = info['id']
3967 info['stitle'] = _simplify_title(info['title'])
# Root page: fan out over every CoursePage link, again via recursive extract().
3969 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3972 'type': 'reference',
3973 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3977 for entry in info['list']:
3978 assert entry['type'] == 'reference'
3979 self.extract(entry['url'])
3981 class MTVIE(InfoExtractor):
3982 """Information extractor for MTV.com"""
3984 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3987 def report_webpage(self, video_id):
3988 """Report information extraction."""
3989 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3991 def report_extraction(self, video_id):
3992 """Report information extraction."""
3993 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3995 def _real_extract(self, url):
3996 mobj = re.match(self._VALID_URL, url)
3998 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4000 if not mobj.group('proto'):
4001 url = 'http://' + url
4002 video_id = mobj.group('videoid')
4003 self.report_webpage(video_id)
4005 request = urllib2.Request(url)
4007 webpage = urllib2.urlopen(request).read()
4008 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4009 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4012 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4014 self._downloader.trouble(u'ERROR: unable to extract song name')
4016 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4019 self._downloader.trouble(u'ERROR: unable to extract performer')
4021 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022 video_title = performer + ' - ' + song_name
4024 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4026 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4028 mtvn_uri = mobj.group(1)
4030 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4032 self._downloader.trouble(u'ERROR: unable to extract content id')
4034 content_id = mobj.group(1)
4036 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4037 self.report_extraction(video_id)
4038 request = urllib2.Request(videogen_url)
4040 metadataXml = urllib2.urlopen(request).read()
4041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4042 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4045 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4046 renditions = mdoc.findall('.//rendition')
4048 # For now, always pick the highest quality.
4049 rendition = renditions[-1]
4052 _,_,ext = rendition.attrib['type'].partition('/')
4053 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4054 video_url = rendition.find('./src').text
4056 self._downloader.trouble('Invalid rendition field.')
4059 self._downloader.increment_downloads()
4063 'uploader': performer,
4064 'title': video_title,
4065 'stitle': _simplify_title(video_title),
4071 self._downloader.process_info(info)
4072 except UnavailableVideoError, err:
4073 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for all post-processing steps.

    A PostProcessor is registered with a downloader through its
    add_post_processor() method (a "mutual registration" scheme similar
    to the one InfoExtractor objects use).  After every successful
    download the downloader walks its chain of PostProcessors, calling
    run() on each: the first receives the initial info dictionary and
    every later one receives whatever the previous run() returned.
    The chain stops when a run() returns None or the end is reached.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        ``information`` is an InfoExtractor-style dictionary carrying
        one extra key, "filepath", which points at the downloaded file.
        Return a (possibly modified) dictionary to pass along the
        chain, or None to stop postprocessing.  Implementations may
        also raise a PostProcessingError, which the downloader handles.

        The base implementation is a no-op that keeps the chain going.
        """
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting audio.

    Fix: the original subclassed BaseException, which Python reserves
    for exit-style exceptions (SystemExit, KeyboardInterrupt); ordinary
    errors should derive from Exception so generic handlers can see
    them.  The base initializer is now called too, so str(err) carries
    the message.  The ``message`` attribute is preserved because
    callers (e.g. FFmpegExtractAudioPP.run) read it directly.
    """
    def __init__(self, message):
        Exception.__init__(self, message)  # makes str(err) meaningful
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that converts a downloaded video into an
    audio-only file by shelling out to ffmpeg (and ffprobe for codec
    detection).

    NOTE(review): a number of source lines are elided in this view
    (marked inline below); every code token shown is untouched.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'  # 'best' = keep the source codec when possible
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality  # forwarded to ffmpeg as -ab (bitrate)
        self._keepvideo = keepvideo  # when True, the source video is not deleted

    # ...[decorator line elided -- presumably @staticmethod, since `path` is the only parameter; confirm]...
    def get_audio_codec(path):
        # Probe `path` with ffprobe and report its audio codec name;
        # the elided branches presumably return None on failure.
        # ...[`try:` header elided]...
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        if handle.wait() != 0:
            # ...[failure return elided]...
        except (IOError, OSError):
            # ...[failure return elided]...
        # Scan ffprobe's key=value output; a codec_name= line precedes
        # the codec_type= line of the same stream.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # ...[return of audio_codec, final fallthrough return, and next decorator elided]...

    def run_ffmpeg(path, out_path, codec, more_opts):
        # Transcode `path` into `out_path` with the given codec and
        # extra ffmpeg options; raises AudioConversionError on failure.
        # ...[branch for `codec is None` elided before this line]...
        acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        # ...[`try:` header elided]...
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:  # ENOENT: the ffmpeg binary is missing
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            # ...[re-raise of other errors elided]...
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        # Convert information['filepath'] to the preferred audio format,
        # fix up its mtime, optionally delete the source video, and
        # point 'filepath' at the new audio file.
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            # ...[early-return elided]...

        # ...[initialization of more_opts elided]...
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                # ...[codec assignment elided -- presumably a stream copy; confirm]...
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']  # repack raw ADTS AAC into MP4
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                # ...[codec assignment elided]...
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    # ...[extension override elided]...
                # ...[else-branch header elided: non-copyable source, re-encode to MP3]...
                acodec = 'libmp3lame'
                # ...[extension / more_opts assignments elided]...
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # ...[else-branch header elided]...
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            # ...[more_opts initialization elided]...
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                # ...[extension override elided]...
            if self._preferredcodec == 'wav':
                # ...[extension override elided]...
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        # ...[`try:` header elided]...
        self.run_ffmpeg(path, new_path, acodec, more_opts)
        # ...[`except:` header elided]...
        etype,e,tb = sys.exc_info()
        if isinstance(e, AudioConversionError):
            self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
        # ...[else-branch header elided]...
            self._downloader.to_stderr(u'ERROR: error running ffmpeg')
        # ...[early-return elided]...

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            # ...[`try:` header elided]...
            os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            # ...[`except:` header elided]...
            self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            # ...[`try:` header elided]...
            os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        information['filepath'] = new_path
        # ...[return statement elided]...
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # NOTE(review): try/except scaffolding, an early return, and the
    # close() calls are elided in this view; code tokens are untouched.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

    # ...[`try:` header elided]...
    urlh = urllib.urlopen(UPDATE_URL)
    newcontent = urlh.read()
    # Compare the embedded version string to avoid a pointless rewrite.
    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is not None and vmatch.group(1) == __version__:
        downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
        # ...[return elided]...
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

    # ...[`try:` header elided]...
    outf = open(filename, 'wb')
    # ...[`try:` header elided]...
    outf.write(newcontent)
    # ...[close/cleanup lines elided]...
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4286 def _readOptions(filename_bytes):
4288 optionf = open(filename_bytes)
4290 return [] # silently skip if file is not present
4294 res += shlex.split(l, comments=True)
4299 def _format_option_string(option):
4300 ''' ('-o', '--option') -> -o, --format METAVAR'''
4304 if option._short_opts: opts.append(option._short_opts[0])
4305 if option._long_opts: opts.append(option._long_opts[0])
4306 if len(opts) > 1: opts.insert(1, ', ')
4308 if option.takes_value(): opts.append(' %s' % option.metavar)
4310 return "".join(opts)
4312 def _find_term_columns():
4313 columns = os.environ.get('COLUMNS', None)
4318 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4319 out,err = sp.communicate()
4320 return int(out.split()[1])
# NOTE(review): the enclosing `def parseOpts():` header is elided from
# this view, as are a handful of interior lines (marked below); every
# code token shown is untouched.  This builds the optparse command-line
# interface and returns (parser, opts, args) after merging config-file
# options with sys.argv.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns  # NOTE(review): max_width's default is assigned on an elided line

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string  # compact "-o, --option METAVAR" rendering

# Keyword arguments for the OptionParser; the dict's opening/closing
# braces (and presumably a 'formatter' entry) are on elided lines.
    'version' : __version__,
    'usage' : '%prog [options] url [url...]',
    'conflict_handler' : 'resolve',  # lets -v be redefined below (--version first, --verbose later)
parser = optparse.OptionParser(**kw)

# Option groups keep --help output organized by topic.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
        action='help', help='print this help text and exit')
general.add_option('-v', '--version',
        action='version', help='print program version and exit')
general.add_option('-U', '--update',
        action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
        action='store_true', dest='dump_user_agent',
        help='display the current browser identification', default=False)
general.add_option('--list-extractors',
        action='store_true', dest='list_extractors',
        help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
        dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
        dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
        action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
        action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
        action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
        action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
        action='store_true', dest='writesubtitles',
        help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
        action='store', dest='subtitleslang', metavar='LANG',
        help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

# NOTE(review): -v is intentionally re-bound from --version to --verbose
# here; the parser's conflict_handler='resolve' makes the later binding win.
verbosity.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
        action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
        action='store_true', dest='getthumbnail',
        help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
        action='store_true', dest='getdescription',
        help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
        action='store_true', dest='getfilename',
        help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
        action='store_true', dest='getformat',
        help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
        action='store_true', dest='consoletitle',
        help='display progress in console titlebar', default=False)
verbosity.add_option('-v', '--verbose',
        action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
        action='store_true', dest='autonumber',
        help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
        dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
        action='store_false', dest='continue_dl',
        help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
        dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
        action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
        action='store_false', dest='updatetime',
        help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
        action='store_true', dest='writedescription',
        help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
        action='store_true', dest='writeinfojson',
        help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
        help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
        help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
        help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Config-file options come first so command-line flags override them.
# NOTE(review): the if/else lines selecting between the two userConf
# assignments (XDG dir vs. ~/.config fallback) are elided here.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the `return [` line, most list entries, and the
    # closing `]` are elided in this view; code tokens are untouched.
    # The three IEs built up front are shared by dependent extractors.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # ...[`return [` elided]...
        YoutubePlaylistIE(youtube_ie),  # playlist/user/search delegate single videos to youtube_ie
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        MetacafeIE(youtube_ie),  # Metacafe also hosts YouTube-backed videos
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
        StanfordOpenClassroomIE(),
    # ...[remaining extractor entries and closing `]` elided]...
# NOTE(review): the enclosing `def _real_main():` header is elided from
# this view, as are many interior lines (marked below).  Near the end of
# this span the header of the separate `def main():` wrapper is elided
# too; the final except-clauses belong to main()'s call of _real_main().
# Every code token shown is untouched.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
    jar = cookielib.CookieJar()  # in-memory only; nothing persisted
# ...[else/try lines elided]...
    jar = cookielib.MozillaCookieJar(opts.cookiefile)
    if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
        # ...[jar.load() elided]...
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
    print std_headers['User-Agent']
    # ...[exit elided]...

# Batch file verification
# ...[batchurls default initialization elided]...
if opts.batchfile is not None:
    # ...[`try:` header elided]...
    if opts.batchfile == '-':
        # ...[stdin branch elided]...
        batchfd = open(opts.batchfile, 'r')
        batchurls = batchfd.readlines()
        batchurls = [x.strip() for x in batchurls]
        # Drop blank lines and lines starting with '#', '/' or ';' (comments).
        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
    # ...[except clause header elided]...
        sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()  # picks up *_proxy environment variables
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# ...[verbose-mode guard elided]...
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

if opts.list_extractors:
    for ie in extractors:
        # ...[IE name print elided]...
        matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
        all_urls = filter(lambda url: url not in matchedUrls, all_urls)
        for mu in matchedUrls:
            # ...[matched-URL print and final exit elided]...

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
    opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
if opts.retries is not None:
    # ...[`try:` header elided]...
    opts.retries = long(opts.retries)
    except (TypeError, ValueError), err:
        parser.error(u'invalid retry count specified')
# ...[`try:` header elided]...
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
    raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist start number specified')
# ...[`try:` header elided]...
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
    raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
    if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
        parser.error(u'invalid audio format specified')

# File downloader: maps parsed CLI flags to FileDownloader parameters.
# The get* flags force quiet/skip_download since they only print info.
# (The dict's closing brace line is elided below.)
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'forceformat': opts.getformat,
    'simulate': opts.simulate,
    'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'format': opts.format,
    'format_limit': opts.format_limit,
    'listformats': opts.listformats,
    # First matching template wins; falls back to plain '%(id)s.%(ext)s'.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    'logtostderr': opts.outtmpl == '-',  # stdout is the video stream when outputting to '-'
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
    'writedescription': opts.writedescription,
    'writeinfojson': opts.writeinfojson,
    'writesubtitles': opts.writesubtitles,
    'subtitleslang': opts.subtitleslang,
    'matchtitle': opts.matchtitle,
    'rejecttitle': opts.rejecttitle,
    'max_downloads': opts.max_downloads,
    'prefer_free_formats': opts.prefer_free_formats,
    'verbose': opts.verbose,
for extractor in extractors:
    fd.add_info_extractor(extractor)

# PostProcessors
if opts.extractaudio:
    fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Update version
if opts.update_self:
    updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')
    # ...[clean-exit branch and download `try:` elided]...
retcode = fd.download(all_urls)
except MaxDownloadsReached:
    fd.to_screen(u'--max-download limit reached, aborting.')
    # ...[retcode assignment elided]...

# Dump cookie jar if requested
if opts.cookiefile is not None:
    # ...[try/jar.save() elided]...
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')

# ...[final sys.exit(retcode), the `def main():` header, and its `try:`
# around _real_main() are elided; the clauses below are main()'s]...
except DownloadError:
    # ...[non-zero exit elided]...
except SameFileError:
    sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
    sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    # ...[body elided -- presumably invokes the main() wrapper; confirm]...

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: