2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    # Fallback: a minimal pure-Python JSON decoder.  The helpers below read a
    # free variable `s` (the input text), so they are presumably nested inside
    # a loads(s) definition whose wrapper lines are elided in this copy --
    # TODO confirm against the full trivialjson source.
    def raiseError(msg, i):
        # Uniform parse-error reporting: message, position, remaining input.
        raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
    def skipSpace(i, expectMore=True):
        # Advance i over JSON whitespace; with expectMore set, running out of
        # input is an error.
        while i < len(s) and s[i] in ' \t\r\n':
            # (increment of i elided in this copy)
            # Reached only when input ends while a token was still expected
            # (the guarding ifs are elided in this copy):
            raiseError('Premature end', i)
    def decodeEscape(match):
        # Decode one backslash escape from a JSON string.  The static escape
        # table and the single-\uXXXX branch header are elided in this copy.
        return unichr(int(esc[1:5], 16))
        if len(esc) == 5+6 and esc[5:7] == '\\u':
            # UTF-16 surrogate pair: combine both halves into one code point.
            hi = int(esc[1:5], 16)
            low = int(esc[7:11], 16)
            return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
        raise ValueError('Unknown escape ' + str(esc))
    # Fragments of a string parser: scan for a closing quote that is not
    # escaped by an odd run of backslashes, then decode the escapes.
    while s[e-bslashes-1] == '\\':
        # (backslash counting elided in this copy)
    if bslashes % 2 == 1:
        # (quote is escaped; keep scanning -- elided in this copy)
    # Matches a surrogate pair, a single \uXXXX, or any other escaped char.
    rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
    stri = rexp.sub(decodeEscape, s[i:e])
    # Fragments of an object parser:
    if s[i] == '}': # Empty dictionary
        # (return elided in this copy)
        raiseError('Expected a string object key', i)
    i,key = parseString(i)
    if i >= len(s) or s[i] != ':':
        raiseError('Expected a colon', i)
    raiseError('Expected comma or closing curly brace', i)
    # Fragments of an array parser:
    if s[i] == ']': # Empty array
        # (return elided in this copy)
    i = skipSpace(i) # Raise exception if premature end
    raiseError('Expected a comma or closing bracket', i)
    def parseDiscrete(i):
        # Parse the bare literals true / false / null.
        for k,v in {'true': True, 'false': False, 'null': None}.items():
            if s.startswith(k, i):
                # (return of (i + len(k), v) elided in this copy)
        raiseError('Not a boolean (or null)', i)
    # Fragments of a number parser (int vs. float decided by the lexeme):
    mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
    # (no-match guard elided in this copy)
    raiseError('Not a number', i)
    if '.' in nums or 'e' in nums or 'E' in nums:
        return (i+len(nums), float(nums))
    return (i+len(nums), int(nums))
    # Dispatch on the first character of a value; anything else is a number.
    CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
    i,res = CHARMAP.get(s[i], parseNumber)(i)
    # Trailing whitespace is allowed; trailing tokens are not.
    i = skipSpace(i, False)
    raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # Query the locale once; presumably falls back to a safe default on
        # failure and then yields the value forever -- the try/except and
        # yield lines are elided in this copy, TODO confirm.
        pref = locale.getpreferredencoding()
    # Generator .next() is Python 2 API (next(gen) in Python 3).
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (#123) or hex (#x7b).
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # (no-match guard elided in this copy)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # 'x7b' -> '0x7b' so long(numstr, 16) can parse it; the base
        # assignments for both branches are elided in this copy.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Fold literal newlines, then turn <br> tags into real newlines.
    html = html.replace('\n', ' ')
    html = re.sub('<\s*br\s*/?\s*>', '\n', html)
    # Strip all remaining tags (non-greedy, so nested text survives).
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
    # (final strip/return elided in this copy)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first; the pattern is the same as ur'(?u)&(.+?);'.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    # Replace the OS path separator so the title cannot create directories.
    return decoded.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # (try: and the filename == u'-' stdout special-case guard are elided
    # in this copy)
    if sys.platform == 'win32':
        # Binary mode keeps CRLF translation from corrupting video data
        # streamed to stdout on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(_encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for unparseable input, so guard before
    # converting; the default init and the return are elided in this copy.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # (return of timestamp elided in this copy)
298 def _simplify_title(title):
299 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
300 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body is elided in this copy; the name suggests
    # first-seen order is preserved -- TODO confirm against full source.
def _unescapeHTML(s):
    """Unescape HTML entities in a string via HTMLParser.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
def _encodeFilename(s):
    """Encode a unicode filename for use with filesystem APIs.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # (the unencoded return for this branch, and the else:, are elided
        # in this copy)
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Bytes actually received vs. bytes the server announced.
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # Fragments of a deflate(data) helper (its def line and try/except are
    # elided in this copy): first try a raw deflate stream (no zlib header),
    # presumably falling back to a zlib-wrapped stream.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl (Python <2.6) lacks getcode()/the code argument.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # (ret.code assignment and return are elided in this copy)

    def http_request(self, req):
        # Add each standard header; a skip-if-already-present guard is
        # elided in this copy.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            # Honor the internal no-compression marker, then strip it so it
            # never reaches the wire.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # (return of req elided in this copy)

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate bodies.  old_resp is
        # presumably bound to the incoming response in elided lines --
        # TODO confirm.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # (return of resp elided in this copy)
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    matchtitle: Download only matching titles.
    rejecttitle: Reject downloads for matching titles.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson: Write the video description to a .info.json file
    writesubtitles: Write the video subtitles to a .srt file
    subtitleslang: Language of the subtitles to download
    """

    # Exit status for the whole batch (set to 1 on ignored errors).
    _download_retcode = None
    # Ordinal of the current download; feeds the %(autonumber)s template.
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # (initialization of the IE/PP lists and self.params is elided in
        # this copy)
        self._download_retcode = 0
        self._num_downloads = 0
        # Route status messages to stderr when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def format_bytes(bytes):
        """Format a byte count with a 1024-based magnitude suffix (e.g. 1.50M)."""
        if type(bytes) is str:
            # (conversion of string input elided in this copy)
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        """Progress percentage string, right-aligned in 6 columns."""
        # (guard for unknown data_len elided in this copy)
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        """Estimated remaining time as MM:SS based on average rate so far."""
        # (dif = now - start elided in this copy)
        if current == 0 or dif < 0.001: # One millisecond
            # (unknown-ETA early return elided in this copy)
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # (overflow guard for >99 minutes elided in this copy)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        """Average speed since *start*, formatted for the progress line."""
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        """Adapt the next read size toward the observed throughput."""
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # (fast-path growth return elided in this copy)
        rate = bytes / elapsed_time
        # (clamping of rate between new_min/new_max and return are elided
        # in this copy)

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # (invalid-format handling elided in this copy)
        number = float(matchobj.group(1))
        # An absent suffix gives index('') == 0, i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # (list append elided in this copy) -- mutual registration: the IE
        # also learns which downloader owns it.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # (list append elided in this copy)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator

            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            self._screen_file.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        # Python 2 print-chevron syntax; encodes with the locale encoding.
        print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # (early return elided in this copy)
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm title escape sequence: OSC 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed (no %(field)s substitutions)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: remember the failure for the final
        # exit status instead of raising.
        self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # (early return elided in this copy; `now` is presumably bound
            # in elided lines -- TODO confirm)
        elapsed = now - start_time
        # (guard for tiny elapsed elided in this copy)
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed to drop back to
            # the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # No .part file when disabled, when streaming to stdout ('-'), or
        # when the target exists but is not a regular file.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
            # (return of filename unchanged elided in this copy)
        return filename + u'.part'

    def undo_temp_name(self, filename):
        # Strip the .part suffix added by temp_name(); other names are
        # presumably returned unchanged (elided in this copy).
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        # Renaming a file onto itself is a no-op.
        if old_filename == new_filename:
            # (early return elided in this copy)
        # (try: elided in this copy)
        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            # (early return elided in this copy)
        if not os.path.isfile(_encodeFilename(filename)):
            # (early return elided in this copy)
        timestr = last_modified_hdr
        # (None guard elided in this copy)
        filetime = timeconvert(timestr)
        # Keep atime as "now"; set mtime from the server-reported time.
        # (try/except around os.utime and the return are partially elided
        # in this copy)
        os.utime(filename, (time.time(), filetime))
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writesubtitles(self, srtfn):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # (early return elided in this copy)
        # \r rewinds to the line start so the bar redraws in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # (try: elided in this copy)
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename may not be representable in the console encoding.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # (else branch that finishes the progress line is elided in this
        # copy)
    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        # (try: elided in this copy)
        template_dict = dict(info_dict)

        # %(epoch)s and %(autonumber)s are synthesized here rather than
        # supplied by the extractor.
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)

        filename = self.params['outtmpl'] % template_dict
        # (return of filename elided in this copy)
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            # (return of None elided in this copy)

    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded """
        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        # Falls through to an implicit/explicit None: no filter rejected
        # this entry.
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Title match/reject filters run first; a reason string means skip.
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            # (early return elided in this copy)

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printing of selected fields (for wrapper tools/scripts).
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # (early return elided in this copy)

        dn = os.path.dirname(_encodeFilename(filename))
        if dn != '' and not os.path.exists(dn): # dn is already encoded
            # (os.makedirs inside a try: is elided in this copy)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            # (early return elided in this copy)

        if self.params.get('writedescription', False):
            # (try: elided in this copy)
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            descfile = open(_encodeFilename(descfn), 'wb')
            descfile.write(info_dict['description'].encode('utf-8'))
            # (descfile close in a finally: elided in this copy)
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                # (early return elided in this copy)

        if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            srtfn = filename.rsplit('.', 1)[0] + u'.srt'
            self.report_writesubtitles(srtfn)
            srtfile = open(_encodeFilename(srtfn), 'wb')
            srtfile.write(info_dict['subtitles'].encode('utf-8'))
            except (OSError, IOError):
                # NOTE(review): this message reports descfn, not srtfn --
                # looks like a copy/paste slip in the original; cannot be
                # fixed in a documentation-only pass.
                self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)

        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            # (probe that the json module exists, inside a try:, elided)
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            infof = open(_encodeFilename(infofn), 'wb')
            # 'urlhandle' is a live connection object, not serializable.
            json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
            json.dump(json_info_dict, infof)
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
                # (skip branch elided in this copy)
            # (try: elided in this copy)
            success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                # (early return elided in this copy)
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # (success guard and try: elided in this copy)
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template cannot give distinct names to multiple downloads.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # (the for url / for ie loops wrapping the body below are elided in
        # this copy)
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # (continue elided in this copy)

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it
        # (ie.extract(url) call elided in this copy)

        # Suitable InfoExtractor had been found; go to next URL
        # (break elided in this copy)

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # (copy of ie_info into a working dict is elided in this copy)
        info['filepath'] = filename
        # (loop feeding info through each registered PostProcessor is
        # elided in this copy)
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump binary."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # (try: elided in this copy)
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # (early return elided in this copy)

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            # (try/except around the pipes import is elided in this copy)
            shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume with -e; after exit code 1 also force -k 1.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # (no progress was made; break elided in this copy)
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                # (treat as success; break elided in this copy)
        # (retval == 0 success guard elided in this copy)
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
        self.try_rename(tmpfilename, filename)
        # (return True for the success path elided in this copy)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        # (return False elided in this copy)
    def _do_download(self, filename, info_dict):
        """Download info_dict['url'] to *filename*, resuming when possible."""
        url = info_dict['url']
        player_url = info_dict.get('player_url', None)

        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            # (early return elided in this copy)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request deliberately has no Range header: it is reused
        # below when a ranged request fails with 416.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(_encodeFilename(tmpfilename)):
            resume_len = os.path.getsize(_encodeFilename(tmpfilename))
        # (else: resume_len = 0 elided in this copy)

        if self.params.get('continuedl', False):
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
        # (else branch and open_mode/count initialization elided in this
        # copy)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # (try: elided in this copy)
            if count == 0 and 'urlhandle' in info_dict:
                # Reuse a connection the extractor already opened.
                data = info_dict['urlhandle']
            data = urllib2.urlopen(request)
            # (break on success elided in this copy)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # (re-raise elided in this copy)
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # (try: elided in this copy)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # (re-raise elided in this copy)
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # (return True elided in this copy)
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
                    # (reset of resume_len/open_mode elided in this copy)
            # (count increment elided in this copy)
            if count <= retries:
                self.report_retry(count, retries)
        # (count > retries guard elided in this copy)
        self.trouble(u'ERROR: giving up after %s retries' % retries)
        # (return False elided in this copy)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Server reports the remaining length; add what we already have.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # (block_size/stream init and the while True: read loop header are
        # elided in this copy)

        # Download and write
        before = time.time()
        data_block = data.read(block_size)
        # (after = time.time() elided in this copy)
        if len(data_block) == 0:
            # (break on EOF elided in this copy)
        byte_counter += len(data_block)

        # Open file just in time
        # (stream-is-None guard and try: elided in this copy)
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        assert stream is not None
        # sanitize_open may have altered the name; recompute the target.
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
            # (return False elided in this copy)
        # (try: elided in this copy)
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            # (return False elided in this copy)
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress message
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        if data_len is None:
            self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
        # (else: elided in this copy)
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

        # (stream-never-opened guard elided in this copy)
        self.trouble(u'\nERROR: Did not get any data blocks')
        # (return False and stream.close() elided in this copy)
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            # try_utime presumably returns the parsed timestamp, which is
            # recorded for later use -- TODO confirm (its return is elided).
            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

        # (return True elided in this copy)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # (init of the ready flag is elided in this copy)
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # (not-yet-initialized guard and flag update are elided in this
        # copy)
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # (self.initialize() call elided in this copy)
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # (pass elided in this copy)

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # (pass elided in this copy)
1184 class YoutubeIE(InfoExtractor):
1185 """Information extractor for youtube.com."""
# _VALID_URL accepts watch pages, youtu.be links, embeds and bare ids while
# excluding playlist/artist pages; group 2 captures the 11-char video id.
1187 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1188 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1189 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1190 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1191 _NETRC_MACHINE = 'youtube'
1192 # Listed in order of quality
1193 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1194 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension; most entries of this table (and of
# _video_dimensions below) are on lines not shown in this excerpt.
1195 _video_extensions = {
1201 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1206 _video_dimensions = {
1221 IE_NAME = u'youtube'
1223 def report_lang(self):
1224 """Report attempt to set language."""
1225 self._downloader.to_screen(u'[youtube] Setting language')
1227 def report_login(self):
1228 """Report attempt to log in."""
1229 self._downloader.to_screen(u'[youtube] Logging in')
1231 def report_age_confirmation(self):
1232 """Report attempt to confirm age."""
1233 self._downloader.to_screen(u'[youtube] Confirming age')
1235 def report_video_webpage_download(self, video_id):
1236 """Report attempt to download video webpage."""
1237 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1239 def report_video_info_webpage_download(self, video_id):
1240 """Report attempt to download video info webpage."""
1241 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1243 def report_video_subtitles_download(self, video_id):
1244 """Report attempt to download video subtitles."""
1245 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1247 def report_information_extraction(self, video_id):
1248 """Report attempt to extract video information."""
1249 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1251 def report_unavailable_format(self, video_id, format):
1252 """Report that the requested video format is not available."""
1253 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1255 def report_rtmp_download(self):
1256 """Indicate the download will use the RTMP protocol."""
1257 self._downloader.to_screen(u'[youtube] RTMP download detected')
1259 def _closed_captions_xml_to_srt(self, xml_string):
# Convert YouTube's timedtext XML into SubRip (SRT) text. The accumulator
# this loop appends to and the final return (original lines 1260/1274+)
# are not visible in this excerpt.
1261 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1262 # TODO parse xml instead of regex
1263 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions with no explicit duration default to 4 seconds.
1264 if not dur: dur = '4'
1265 start = float(start)
1266 end = start + float(dur)
# Format timestamps as HH:MM:SS,mmm as required by SRT.
1267 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1268 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1269 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1270 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional: the feed is double-escaped
# NOTE(review): SRT cue numbers conventionally start at 1 but n starts at 0.
1271 srt += str(n) + '\n'
1272 srt += start + ' --> ' + end + '\n'
1273 srt += caption + '\n\n'
1276 def _print_formats(self, formats):
# List each available itag with its container extension (default 'flv')
# and dimensions (default '???'). The loop header binding x over `formats`
# (original line 1278) is not visible in this excerpt.
1277 print 'Available formats:'
1279 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1281 def _real_initialize(self):
# Session setup: force the English interface, log in (credentials from
# params or ~/.netrc), and confirm age. Language and login failures are
# warnings; age-confirmation failure is an error.
1282 if self._downloader is None:
1287 downloader_params = self._downloader.params
1289 # Attempt to use provided username and password or .netrc data
1290 if downloader_params.get('username', None) is not None:
1291 username = downloader_params['username']
1292 password = downloader_params['password']
1293 elif downloader_params.get('usenetrc', False):
1295 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1296 if info is not None:
1300 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1301 except (IOError, netrc.NetrcParseError), err:
1302 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language so that later page-scraping regexes match English markup.
1306 request = urllib2.Request(self._LANG_URL)
1309 urllib2.urlopen(request).read()
1310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1314 # No authentication to be performed
1315 if username is None:
1320 'current_form': 'loginForm',
1322 'action_login': 'Log In',
1323 'username': username,
1324 'password': password,
1326 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1329 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, authentication failed.
1330 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1331 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1333 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1334 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# POST the age-verification form so age-restricted videos are reachable.
1340 'action_confirm': 'Confirm',
1342 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1344 self.report_age_confirmation()
1345 age_results = urllib2.urlopen(request).read()
1346 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1347 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1350 def _real_extract(self, url):
# Full extraction pipeline: watch page -> get_video_info -> metadata ->
# optional subtitles -> format selection -> process_info() per format.
1351 # Extract video id from URL
1352 mobj = re.match(self._VALID_URL, url)
1354 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1356 video_id = mobj.group(2)
1359 self.report_video_webpage_download(video_id)
1360 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1362 video_webpage = urllib2.urlopen(request).read()
1363 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1364 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1367 # Attempt to extract SWF player URL
1368 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1369 if mobj is not None:
# Undo the JSON-style backslash escaping in the SWF URL.
1370 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1375 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; some videos expose their info on only one.
1376 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1377 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1378 % (video_id, el_type))
1379 request = urllib2.Request(video_info_url)
1381 video_info_webpage = urllib2.urlopen(request).read()
1382 video_info = parse_qs(video_info_webpage)
1383 if 'token' in video_info:
1385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1386 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1388 if 'token' not in video_info:
1389 if 'reason' in video_info:
1390 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1392 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1395 # Start extracting information
1396 self.report_information_extraction(video_id)
1399 if 'author' not in video_info:
1400 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1402 video_uploader = urllib.unquote_plus(video_info['author'][0])
1405 if 'title' not in video_info:
1406 self._downloader.trouble(u'ERROR: unable to extract video title')
1408 video_title = urllib.unquote_plus(video_info['title'][0])
1409 video_title = video_title.decode('utf-8')
1410 video_title = sanitize_title(video_title)
1413 simple_title = _simplify_title(video_title)
1416 if 'thumbnail_url' not in video_info:
1417 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1418 video_thumbnail = ''
1419 else: # don't panic if we can't find it
1420 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalized to YYYYMMDD.
1424 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1425 if mobj is not None:
1426 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1427 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1428 for expression in format_expressions:
1430 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1438 video_description = u'No description available.'
1439 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1440 if mobj is not None:
1441 video_description = mobj.group(1).decode('utf-8')
# Prefer the full on-page description when lxml is available.
1443 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1444 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1445 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1446 # TODO use another parser
1449 video_subtitles = None
1450 if self._downloader.params.get('writesubtitles', False):
1451 self.report_video_subtitles_download(video_id)
1452 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1454 srt_list = urllib2.urlopen(request).read()
1455 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1458 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
# Language priority: explicit request, then English, then first available.
1460 if self._downloader.params.get('subtitleslang', False):
1461 srt_lang = self._downloader.params.get('subtitleslang')
1462 elif 'en' in srt_lang_list:
1465 srt_lang = srt_lang_list[0]
1466 if not srt_lang in srt_lang_list:
1467 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1469 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1471 srt_xml = urllib2.urlopen(request).read()
1472 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1475 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1477 self._downloader.trouble(u'WARNING: video has no closed captions')
1480 video_token = urllib.unquote_plus(video_info['token'][0])
1482 # Decide which formats to download
1483 req_format = self._downloader.params.get('format', None)
# RTMP streams carry their URL in 'conn' and bypass the itag format table.
1485 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1486 self.report_rtmp_download()
1487 video_url_list = [(None, video_info['conn'][0])]
1488 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1489 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1490 url_data = [parse_qs(uds) for uds in url_data_strs]
1491 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1492 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1494 format_limit = self._downloader.params.get('format_limit', None)
1495 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1496 if format_limit is not None and format_limit in available_formats:
1497 format_list = available_formats[available_formats.index(format_limit):]
1499 format_list = available_formats
1500 existing_formats = [x for x in format_list if x in url_map]
1501 if len(existing_formats) == 0:
1502 self._downloader.trouble(u'ERROR: no known formats available for video')
1504 if self._downloader.params.get('listformats', None):
1505 self._print_formats(existing_formats)
1507 if req_format is None or req_format == 'best':
1508 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1509 elif req_format == 'worst':
1510 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1511 elif req_format in ('-1', 'all'):
1512 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1514 # Specific formats. We pick the first in a slash-delimited sequence.
1515 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1516 req_formats = req_format.split('/')
1517 video_url_list = None
1518 for rf in req_formats:
1520 video_url_list = [(rf, url_map[rf])]
1522 if video_url_list is None:
1523 self._downloader.trouble(u'ERROR: requested format not available')
1526 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1529 for format_param, video_real_url in video_url_list:
1530 # At this point we have a new video
1531 self._downloader.increment_downloads()
1534 video_extension = self._video_extensions.get(format_param, 'flv')
1537 # Process video information
1538 self._downloader.process_info({
1539 'id': video_id.decode('utf-8'),
1540 'url': video_real_url.decode('utf-8'),
1541 'uploader': video_uploader.decode('utf-8'),
1542 'upload_date': upload_date,
1543 'title': video_title,
1544 'stitle': simple_title,
1545 'ext': video_extension.decode('utf-8'),
1546 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1547 'thumbnail': video_thumbnail.decode('utf-8'),
1548 'description': video_description,
1549 'player_url': player_url,
1550 'subtitles': video_subtitles
1552 except UnavailableVideoError, err:
1553 self._downloader.trouble(u'\nERROR: unable to download video')
1556 class MetacafeIE(InfoExtractor):
1557 """Information Extractor for metacafe.com."""
# Group 1 is the video id (may be 'yt-<id>' for YouTube-hosted videos),
# group 2 the simplified title slug taken straight from the URL.
1559 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Endpoints used during initialization to accept the family-filter disclaimer.
1560 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1561 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1563 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
	"""Constructor. Keeps a YouTube extractor around so that 'yt-*'
	metacafe ids can be delegated to it; passes the optional
	downloader up to InfoExtractor."""
	self._youtube_ie = youtube_ie
	InfoExtractor.__init__(self, downloader)
1569 def report_disclaimer(self):
1570 """Report disclaimer retrieval."""
1571 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1573 def report_age_confirmation(self):
1574 """Report attempt to confirm age."""
1575 self._downloader.to_screen(u'[metacafe] Confirming age')
1577 def report_download_webpage(self, video_id):
1578 """Report webpage download."""
1579 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1581 def report_extraction(self, video_id):
1582 """Report information extraction."""
1583 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1585 def _real_initialize(self):
# Fetch the family-filter disclaimer page and then POST the confirmation
# form so age-restricted videos are accessible for the session.
1586 # Retrieve disclaimer
1587 request = urllib2.Request(self._DISCLAIMER)
1589 self.report_disclaimer()
1590 disclaimer = urllib2.urlopen(request).read()
1591 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1592 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age; the rest of the form dict is on lines not shown here.
1598 'submit': "Continue - I'm over 18",
1600 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1602 self.report_age_confirmation()
1603 disclaimer = urllib2.urlopen(request).read()
1604 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1605 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1608 def _real_extract(self, url):
# Extract media URL, title and uploader from a metacafe watch page; videos
# whose id starts with 'yt-' are handed off to the YouTube extractor.
1609 # Extract id and simplified title from URL
1610 mobj = re.match(self._VALID_URL, url)
1612 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1615 video_id = mobj.group(1)
1617 # Check if video comes from YouTube
1618 mobj2 = re.match(r'^yt-(.*)$', video_id)
1619 if mobj2 is not None:
1620 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1623 # At this point we have a new video
1624 self._downloader.increment_downloads()
1626 simple_title = mobj.group(2).decode('utf-8')
1628 # Retrieve video webpage to extract further information
1629 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1631 self.report_download_webpage(video_id)
1632 webpage = urllib2.urlopen(request).read()
1633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1634 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1637 # Extract URL, uploader and title from webpage
1638 self.report_extraction(video_id)
# Primary path: legacy pages expose mediaURL (plus optional gdaKey) directly.
1639 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1640 if mobj is not None:
1641 mediaURL = urllib.unquote(mobj.group(1))
1642 video_extension = mediaURL[-3:]
1644 # Extract gdaKey if available
1645 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1647 video_url = mediaURL
1649 gdaKey = mobj.group(1)
1650 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: newer pages embed the media URL inside the flashvars blob.
1652 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1654 self._downloader.trouble(u'ERROR: unable to extract media URL')
1656 vardict = parse_qs(mobj.group(1))
1657 if 'mediaData' not in vardict:
1658 self._downloader.trouble(u'ERROR: unable to extract media URL')
1660 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1662 self._downloader.trouble(u'ERROR: unable to extract media URL')
1664 mediaURL = mobj.group(1).replace('\\/', '/')
1665 video_extension = mediaURL[-3:]
1666 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1668 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract title')
1672 video_title = mobj.group(1).decode('utf-8')
1673 video_title = sanitize_title(video_title)
1675 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1677 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1679 video_uploader = mobj.group(1)
1682 # Process video information
1683 self._downloader.process_info({
1684 'id': video_id.decode('utf-8'),
1685 'url': video_url.decode('utf-8'),
1686 'uploader': video_uploader.decode('utf-8'),
1687 'upload_date': u'NA',
1688 'title': video_title,
1689 'stitle': simple_title,
1690 'ext': video_extension.decode('utf-8'),
1694 except UnavailableVideoError:
1695 self._downloader.trouble(u'\nERROR: unable to download video')
1698 class DailymotionIE(InfoExtractor):
1699 """Information Extractor for Dailymotion"""
# Group 1 is the video id, group 2 the slugified title from the URL.
1701 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1702 IE_NAME = u'dailymotion'
1704 def __init__(self, downloader=None):
1705 InfoExtractor.__init__(self, downloader)
1707 def report_download_webpage(self, video_id):
1708 """Report webpage download."""
1709 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1711 def report_extraction(self, video_id):
1712 """Report information extraction."""
1713 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1715 def _real_extract(self, url):
# Pull the SD media URL out of the player's "sequence" flashvar and the
# title/uploader out of the page markup; always downloads as FLV.
1716 # Extract id and simplified title from URL
1717 mobj = re.match(self._VALID_URL, url)
1719 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1722 # At this point we have a new video
1723 self._downloader.increment_downloads()
1724 video_id = mobj.group(1)
1726 video_extension = 'flv'
1728 # Retrieve video webpage to extract further information
1729 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages still render.
1730 request.add_header('Cookie', 'family_filter=off')
1732 self.report_download_webpage(video_id)
1733 webpage = urllib2.urlopen(request).read()
1734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1738 # Extract URL, uploader and title from webpage
1739 self.report_extraction(video_id)
1740 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1742 self._downloader.trouble(u'ERROR: unable to extract media URL')
1744 sequence = urllib.unquote(mobj.group(1))
# 'sdURL' inside the sequence JSON is the standard-definition stream.
1745 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1747 self._downloader.trouble(u'ERROR: unable to extract media URL')
1749 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1751 # if needed add http://www.dailymotion.com/ if relative URL
1753 video_url = mediaURL
1755 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1757 self._downloader.trouble(u'ERROR: unable to extract title')
1759 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1760 video_title = sanitize_title(video_title)
1761 simple_title = _simplify_title(video_title)
1763 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1765 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1767 video_uploader = mobj.group(1)
1770 # Process video information
1771 self._downloader.process_info({
1772 'id': video_id.decode('utf-8'),
1773 'url': video_url.decode('utf-8'),
1774 'uploader': video_uploader.decode('utf-8'),
1775 'upload_date': u'NA',
1776 'title': video_title,
1777 'stitle': simple_title,
1778 'ext': video_extension.decode('utf-8'),
1782 except UnavailableVideoError:
1783 self._downloader.trouble(u'\nERROR: unable to download video')
1786 class GoogleIE(InfoExtractor):
1787 """Information extractor for video.google.com."""
# Group 1 captures the docid across the various national Google Video domains.
1789 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1790 IE_NAME = u'video.google'
1792 def __init__(self, downloader=None):
1793 InfoExtractor.__init__(self, downloader)
1795 def report_download_webpage(self, video_id):
1796 """Report webpage download."""
1797 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1799 def report_extraction(self, video_id):
1800 """Report information extraction."""
1801 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1803 def _real_extract(self, url):
# Prefer the direct mp4 download link; otherwise fall back to the embedded
# FLV stream URL hidden behind JavaScript hex escapes.
1804 # Extract id from URL
1805 mobj = re.match(self._VALID_URL, url)
1807 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1810 # At this point we have a new video
1811 self._downloader.increment_downloads()
1812 video_id = mobj.group(1)
1814 video_extension = 'mp4'
1816 # Retrieve video webpage to extract further information
1817 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1819 self.report_download_webpage(video_id)
1820 webpage = urllib2.urlopen(request).read()
1821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825 # Extract URL, uploader, and title from webpage
1826 self.report_extraction(video_id)
1827 mobj = re.search(r"download_url:'([^']+)'", webpage)
# No direct download link found: switch to the FLV stream.
1829 video_extension = 'flv'
1830 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1832 self._downloader.trouble(u'ERROR: unable to extract media URL')
1834 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JavaScript hex escapes ('=' and '&') left in the URL.
1835 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1836 mediaURL = mediaURL.replace('\\x26', '\x26')
1838 video_url = mediaURL
1840 mobj = re.search(r'<title>(.*)</title>', webpage)
1842 self._downloader.trouble(u'ERROR: unable to extract title')
1844 video_title = mobj.group(1).decode('utf-8')
1845 video_title = sanitize_title(video_title)
1846 simple_title = _simplify_title(video_title)
1848 # Extract video description
1849 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1851 self._downloader.trouble(u'ERROR: unable to extract video description')
1853 video_description = mobj.group(1).decode('utf-8')
1854 if not video_description:
1855 video_description = 'No description available.'
1857 # Extract video thumbnail
1858 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail only appears on the search results page, so search for the docid.
1859 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1861 webpage = urllib2.urlopen(request).read()
1862 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1865 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1867 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1869 video_thumbnail = mobj.group(1)
1870 else: # we need something to pass to process_info
1871 video_thumbnail = ''
1874 # Process video information
1875 self._downloader.process_info({
1876 'id': video_id.decode('utf-8'),
1877 'url': video_url.decode('utf-8'),
1879 'upload_date': u'NA',
1880 'title': video_title,
1881 'stitle': simple_title,
1882 'ext': video_extension.decode('utf-8'),
1886 except UnavailableVideoError:
1887 self._downloader.trouble(u'\nERROR: unable to download video')
1890 class PhotobucketIE(InfoExtractor):
1891 """Information extractor for photobucket.com."""
# Only '?current=*.flv' URLs are supported; group 1 captures the FLV file name.
1893 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1894 IE_NAME = u'photobucket'
1896 def __init__(self, downloader=None):
1897 InfoExtractor.__init__(self, downloader)
1899 def report_download_webpage(self, video_id):
1900 """Report webpage download."""
1901 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1903 def report_extraction(self, video_id):
1904 """Report information extraction."""
1905 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1907 def _real_extract(self, url):
1908 # Extract id from URL
1909 mobj = re.match(self._VALID_URL, url)
1911 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1914 # At this point we have a new video
1915 self._downloader.increment_downloads()
1916 video_id = mobj.group(1)
1918 video_extension = 'flv'
1920 # Retrieve video webpage to extract further information
1921 request = urllib2.Request(url)
1923 self.report_download_webpage(video_id)
1924 webpage = urllib2.urlopen(request).read()
1925 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1926 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1929 # Extract URL, uploader, and title from webpage
1930 self.report_extraction(video_id)
1931 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1933 self._downloader.trouble(u'ERROR: unable to extract media URL')
1935 mediaURL = urllib.unquote(mobj.group(1))
1937 video_url = mediaURL
1939 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1941 self._downloader.trouble(u'ERROR: unable to extract title')
1943 video_title = mobj.group(1).decode('utf-8')
1944 video_title = sanitize_title(video_title)
1945 simple_title = _simplify_title(vide_title)
1947 video_uploader = mobj.group(2).decode('utf-8')
1950 # Process video information
1951 self._downloader.process_info({
1952 'id': video_id.decode('utf-8'),
1953 'url': video_url.decode('utf-8'),
1954 'uploader': video_uploader,
1955 'upload_date': u'NA',
1956 'title': video_title,
1957 'stitle': simple_title,
1958 'ext': video_extension.decode('utf-8'),
1962 except UnavailableVideoError:
1963 self._downloader.trouble(u'\nERROR: unable to download video')
1966 class YahooIE(InfoExtractor):
1967 """Information extractor for video.yahoo.com."""
1969 # _VALID_URL matches all Yahoo! Video URLs
1970 # _VPAGE_URL matches only the extractable '/watch/' URLs
1971 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1972 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1973 IE_NAME = u'video.yahoo'
1975 def __init__(self, downloader=None):
1976 InfoExtractor.__init__(self, downloader)
1978 def report_download_webpage(self, video_id):
1979 """Report webpage download."""
1980 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1982 def report_extraction(self, video_id):
1983 """Report information extraction."""
1984 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1986 def _real_extract(self, url, new_video=True):
1987 # Extract ID from URL
1988 mobj = re.match(self._VALID_URL, url)
1990 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1993 # At this point we have a new video
1994 self._downloader.increment_downloads()
1995 video_id = mobj.group(2)
1996 video_extension = 'flv'
1998 # Rewrite valid but non-extractable URLs as
1999 # extractable English language /watch/ URLs
2000 if re.match(self._VPAGE_URL, url) is None:
2001 request = urllib2.Request(url)
2003 webpage = urllib2.urlopen(request).read()
2004 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2008 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2010 self._downloader.trouble(u'ERROR: Unable to extract id field')
2012 yahoo_id = mobj.group(1)
2014 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2016 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2018 yahoo_vid = mobj.group(1)
2020 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2021 return self._real_extract(url, new_video=False)
2023 # Retrieve video webpage to extract further information
2024 request = urllib2.Request(url)
2026 self.report_download_webpage(video_id)
2027 webpage = urllib2.urlopen(request).read()
2028 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2029 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2032 # Extract uploader and title from webpage
2033 self.report_extraction(video_id)
2034 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2036 self._downloader.trouble(u'ERROR: unable to extract video title')
2038 video_title = mobj.group(1).decode('utf-8')
2039 simple_title = _simplify_title(video_title)
2041 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2043 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2045 video_uploader = mobj.group(1).decode('utf-8')
2047 # Extract video thumbnail
2048 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2050 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2052 video_thumbnail = mobj.group(1).decode('utf-8')
2054 # Extract video description
2055 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2057 self._downloader.trouble(u'ERROR: unable to extract video description')
2059 video_description = mobj.group(1).decode('utf-8')
2060 if not video_description:
2061 video_description = 'No description available.'
2063 # Extract video height and width
2064 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2066 self._downloader.trouble(u'ERROR: unable to extract video height')
2068 yv_video_height = mobj.group(1)
2070 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2072 self._downloader.trouble(u'ERROR: unable to extract video width')
2074 yv_video_width = mobj.group(1)
2076 # Retrieve video playlist to extract media URL
2077 # I'm not completely sure what all these options are, but we
2078 # seem to need most of them, otherwise the server sends a 401.
2079 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2080 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2081 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2082 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2083 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2085 self.report_download_webpage(video_id)
2086 webpage = urllib2.urlopen(request).read()
2087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2088 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2091 # Extract media URL from playlist XML
2092 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2094 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2096 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2097 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2100 # Process video information
2101 self._downloader.process_info({
2102 'id': video_id.decode('utf-8'),
2104 'uploader': video_uploader,
2105 'upload_date': u'NA',
2106 'title': video_title,
2107 'stitle': simple_title,
2108 'ext': video_extension.decode('utf-8'),
2109 'thumbnail': video_thumbnail.decode('utf-8'),
2110 'description': video_description,
2111 'thumbnail': video_thumbnail,
2114 except UnavailableVideoError:
2115 self._downloader.trouble(u'\nERROR: unable to download video')
2118 class VimeoIE(InfoExtractor):
2119 """Information extractor for vimeo.com."""
# NOTE(review): the original line numbers fused into this excerpt jump
# (e.g. 2119 -> 2121), so statements such as `if mobj is None:` guards,
# `try:` lines and `return` statements are elided from view. Confirm
# against the complete file before changing any logic here.
2121 # _VALID_URL matches Vimeo URLs
2122 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2125 def __init__(self, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
# Status helpers: route progress messages through the shared downloader.
2128 def report_download_webpage(self, video_id):
2129 """Report webpage download."""
2130 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2132 def report_extraction(self, video_id):
2133 """Report information extraction."""
2134 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2136 def _real_extract(self, url, new_video=True):
# The numeric clip id is capture group 1 of _VALID_URL.
2137 # Extract ID from URL
2138 mobj = re.match(self._VALID_URL, url)
2140 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2143 # At this point we have a new video
2144 self._downloader.increment_downloads()
2145 video_id = mobj.group(1)
2147 # Retrieve video webpage to extract further information
2148 request = urllib2.Request(url, None, std_headers)
2150 self.report_download_webpage(video_id)
2151 webpage = urllib2.urlopen(request).read()
2152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2156 # Now we begin extracting as much information as we can from what we
2157 # retrieved. First we extract the information common to all extractors,
2158 # and latter we extract those that are Vimeo specific.
2159 self.report_extraction(video_id)
2161 # Extract the config JSON
# The player config is embedded in the page as ` = {config:...,assets:...}`;
# it is sliced out by plain string splitting, then parsed with json.loads.
2162 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2164 config = json.loads(config)
2166 self._downloader.trouble(u'ERROR: unable to extract info section')
2170 video_title = config["video"]["title"]
2171 simple_title = _simplify_title(video_title)
2174 video_uploader = config["video"]["owner"]["name"]
2176 # Extract video thumbnail
2177 video_thumbnail = config["video"]["thumbnail"]
2179 # Extract video description
# Default first, then try the <meta description> tag, then (in an elided
# branch, presumably a fallback) an lxml xpath over the page body.
2183 video_description = u'No description available.'
2184 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2185 if mobj is not None:
2186 video_description = mobj.group(1)
2188 html_parser = lxml.etree.HTMLParser()
2189 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2190 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2191 # TODO use another parser
2193 # Extract upload date
2194 video_upload_date = u'NA'
2195 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2196 if mobj is not None:
2197 video_upload_date = mobj.group(1)
2199 # Vimeo specific: extract request signature and timestamp
2200 sig = config['request']['signature']
2201 timestamp = config['request']['timestamp']
2203 # Vimeo specific: extract video codec and quality information
2204 # TODO bind to format param
# Preference order: h264/mp4, then vp8/flv, then vp6/flv; the first codec
# present in config["video"]["files"] wins, and 'hd' availability selects quality.
2205 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2206 for codec in codecs:
2207 if codec[0] in config["video"]["files"]:
2208 video_codec = codec[0]
2209 video_extension = codec[1]
2210 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2211 else: quality = 'sd'
2214 self._downloader.trouble(u'ERROR: no known codec found')
# Final media URL: the play_redirect endpoint signed with sig/timestamp.
2217 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2218 %(video_id, sig, timestamp, quality, video_codec.upper())
2221 # Process video information
2222 self._downloader.process_info({
2225 'uploader': video_uploader,
2226 'upload_date': video_upload_date,
2227 'title': video_title,
2228 'stitle': simple_title,
2229 'ext': video_extension,
2230 'thumbnail': video_thumbnail,
2231 'description': video_description,
2234 except UnavailableVideoError:
2235 self._downloader.trouble(u'ERROR: unable to download video')
2238 class GenericIE(InfoExtractor):
2239 """Generic last-resort information extractor."""
# NOTE(review): original line numbers in this excerpt jump (2239 -> 2242),
# so guard statements (`if mobj is None:`, `try:`, `return`) are elided
# from view; verify against the complete file before editing.
2242 IE_NAME = u'generic'
2244 def __init__(self, downloader=None):
2245 InfoExtractor.__init__(self, downloader)
2247 def report_download_webpage(self, video_id):
2248 """Report webpage download."""
# Warn loudly: reaching this extractor means every specific one failed.
2249 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2250 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2252 def report_extraction(self, video_id):
2253 """Report information extraction."""
2254 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2256 def _real_extract(self, url):
2257 # At this point we have a new video
2258 self._downloader.increment_downloads()
# Provisional id: last URL path component (refined below from the media URL).
2260 video_id = url.split('/')[-1]
2261 request = urllib2.Request(url)
2263 self.report_download_webpage(video_id)
2264 webpage = urllib2.urlopen(request).read()
2265 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2266 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2268 except ValueError, err:
2269 # since this is the last-resort InfoExtractor, if
2270 # this error is thrown, it'll be thrown here
2271 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2274 self.report_extraction(video_id)
2275 # Start with something easy: JW Player in SWFObject
2276 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2278 # Broaden the search a little bit
2279 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2281 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2284 # It's possible that one of the regexes
2285 # matched, but returned an empty group:
2286 if mobj.group(1) is None:
2287 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Media URL found; derive the real id/extension from its basename.
2290 video_url = urllib.unquote(mobj.group(1))
2291 video_id = os.path.basename(video_url)
2293 # here's a fun little line of code for you:
2294 video_extension = os.path.splitext(video_id)[1][1:]
2295 video_id = os.path.splitext(video_id)[0]
2297 # it's tempting to parse this further, but you would
2298 # have to take into account all the variations like
2299 # Video Title - Site Name
2300 # Site Name | Video Title
2301 # Video Title - Tagline | Site Name
2302 # and so on and so forth; it's just not practical
2303 mobj = re.search(r'<title>(.*)</title>', webpage)
2305 self._downloader.trouble(u'ERROR: unable to extract title')
2307 video_title = mobj.group(1).decode('utf-8')
2308 video_title = sanitize_title(video_title)
2309 simple_title = _simplify_title(video_title)
2311 # video uploader is domain name
2312 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2314 self._downloader.trouble(u'ERROR: unable to extract title')
2316 video_uploader = mobj.group(1).decode('utf-8')
2319 # Process video information
2320 self._downloader.process_info({
2321 'id': video_id.decode('utf-8'),
2322 'url': video_url.decode('utf-8'),
2323 'uploader': video_uploader,
2324 'upload_date': u'NA',
2325 'title': video_title,
2326 'stitle': simple_title,
2327 'ext': video_extension.decode('utf-8'),
2331 except UnavailableVideoError, err:
2332 self._downloader.trouble(u'\nERROR: unable to download video')
2335 class YoutubeSearchIE(InfoExtractor):
2336 """Information Extractor for YouTube search queries."""
# NOTE(review): lines are elided in this excerpt (fused numbering jumps);
# in particular the `if mobj is None:` / `try:` / `return` scaffolding is
# not visible. Confirm against the full file before changing logic.
# Queries look like "ytsearch<N>:terms", "ytsearch:terms" or "ytsearchall:terms".
2337 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2338 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2340 _max_youtube_results = 1000
2341 IE_NAME = u'youtube:search'
2343 def __init__(self, youtube_ie, downloader=None):
2344 InfoExtractor.__init__(self, downloader)
# Results are delegated to the real YouTube extractor, one id at a time.
2345 self._youtube_ie = youtube_ie
2347 def report_download_page(self, query, pagenum):
2348 """Report attempt to download playlist page with given number."""
2349 query = query.decode(preferredencoding())
2350 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2352 def _real_initialize(self):
2353 self._youtube_ie.initialize()
2355 def _real_extract(self, query):
2356 mobj = re.match(self._VALID_URL, query)
2358 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "prefix:terms"; prefix is '', 'all', or a result count.
2361 prefix, query = query.split(':')
2363 query = query.encode('utf-8')
2365 self._download_n_results(query, 1)
2367 elif prefix == 'all':
2368 self._download_n_results(query, self._max_youtube_results)
2374 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2376 elif n > self._max_youtube_results:
2377 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2378 n = self._max_youtube_results
2379 self._download_n_results(query, n)
2381 except ValueError: # parsing prefix as integer fails
2382 self._download_n_results(query, 1)
2385 def _download_n_results(self, query, n):
2386 """Downloads a specified number of results for a query"""
# Page through the GData API (50 results per page) until `limit` reached.
2392 while (50 * pagenum) < limit:
2393 self.report_download_page(query, pagenum+1)
2394 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2395 request = urllib2.Request(result_url)
2397 data = urllib2.urlopen(request).read()
2398 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2399 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2401 api_response = json.loads(data)['data']
2403 new_ids = list(video['id'] for video in api_response['items'])
2404 video_ids += new_ids
# Cap at the API-reported total so we never over-page.
2406 limit = min(n, api_response['totalItems'])
2409 if len(video_ids) > n:
2410 video_ids = video_ids[:n]
2411 for id in video_ids:
2412 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2416 class GoogleSearchIE(InfoExtractor):
2417 """Information Extractor for Google Video search queries."""
# NOTE(review): same structure as YoutubeSearchIE/YahooSearchIE; lines are
# elided in this excerpt (fused numbering jumps), so guards and returns are
# not visible — verify against the complete file before editing.
2418 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2419 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2420 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2421 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2423 _max_google_results = 1000
2424 IE_NAME = u'video.google:search'
2426 def __init__(self, google_ie, downloader=None):
2427 InfoExtractor.__init__(self, downloader)
# Each found docid is handed to the real Google Video extractor.
2428 self._google_ie = google_ie
2430 def report_download_page(self, query, pagenum):
2431 """Report attempt to download playlist page with given number."""
2432 query = query.decode(preferredencoding())
2433 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2435 def _real_initialize(self):
2436 self._google_ie.initialize()
2438 def _real_extract(self, query):
2439 mobj = re.match(self._VALID_URL, query)
2441 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '', 'all', or a result count (parsed in an elided branch).
2444 prefix, query = query.split(':')
2446 query = query.encode('utf-8')
2448 self._download_n_results(query, 1)
2450 elif prefix == 'all':
2451 self._download_n_results(query, self._max_google_results)
2457 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2459 elif n > self._max_google_results:
2460 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2461 n = self._max_google_results
2462 self._download_n_results(query, n)
2464 except ValueError: # parsing prefix as integer fails
2465 self._download_n_results(query, 1)
2468 def _download_n_results(self, query, n):
2469 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 per page) for docids until n collected
# or the "next page" marker disappears.
2475 self.report_download_page(query, pagenum)
2476 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2477 request = urllib2.Request(result_url)
2479 page = urllib2.urlopen(request).read()
2480 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2481 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2484 # Extract video identifiers
2485 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2486 video_id = mobj.group(1)
2487 if video_id not in video_ids:
2488 video_ids.append(video_id)
2489 if len(video_ids) == n:
2490 # Specified n videos reached
2491 for id in video_ids:
2492 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No more pages: flush whatever was collected so far.
2495 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2496 for id in video_ids:
2497 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2500 pagenum = pagenum + 1
2503 class YahooSearchIE(InfoExtractor):
2504 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): mirrors GoogleSearchIE almost line for line (this one keeps
# an `already_seen` set for de-duplication). Lines are elided in this
# excerpt (fused numbering jumps); verify against the complete file.
2505 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2506 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2507 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2508 _MORE_PAGES_INDICATOR = r'\s*Next'
2510 _max_yahoo_results = 1000
2511 IE_NAME = u'video.yahoo:search'
2513 def __init__(self, yahoo_ie, downloader=None):
2514 InfoExtractor.__init__(self, downloader)
# Each found watch id is handed to the real Yahoo Video extractor.
2515 self._yahoo_ie = yahoo_ie
2517 def report_download_page(self, query, pagenum):
2518 """Report attempt to download playlist page with given number."""
2519 query = query.decode(preferredencoding())
2520 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2522 def _real_initialize(self):
2523 self._yahoo_ie.initialize()
2525 def _real_extract(self, query):
2526 mobj = re.match(self._VALID_URL, query)
2528 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2531 prefix, query = query.split(':')
2533 query = query.encode('utf-8')
2535 self._download_n_results(query, 1)
2537 elif prefix == 'all':
2538 self._download_n_results(query, self._max_yahoo_results)
2544 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2546 elif n > self._max_yahoo_results:
2547 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2548 n = self._max_yahoo_results
2549 self._download_n_results(query, n)
2551 except ValueError: # parsing prefix as integer fails
2552 self._download_n_results(query, 1)
2555 def _download_n_results(self, query, n):
2556 """Downloads a specified number of results for a query"""
# Track seen ids so repeated links on a result page are counted once.
2559 already_seen = set()
2563 self.report_download_page(query, pagenum)
2564 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2565 request = urllib2.Request(result_url)
2567 page = urllib2.urlopen(request).read()
2568 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2572 # Extract video identifiers
2573 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2574 video_id = mobj.group(1)
2575 if video_id not in already_seen:
2576 video_ids.append(video_id)
2577 already_seen.add(video_id)
2578 if len(video_ids) == n:
2579 # Specified n videos reached
2580 for id in video_ids:
2581 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No more pages: flush whatever was collected so far.
2584 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2585 for id in video_ids:
2586 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2589 pagenum = pagenum + 1
2592 class YoutubePlaylistIE(InfoExtractor):
2593 """Information Extractor for YouTube playlists."""
# NOTE(review): lines are elided in this excerpt (fused numbering jumps),
# so `if mobj is None:` guards, loop headers and `return`s are missing from
# view. Confirm against the complete file before changing logic.
# group(1): playlist type char (p/a/list); group(2): playlist id;
# group(3): optional direct video id within the playlist URL.
2595 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2596 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2597 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2598 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2600 IE_NAME = u'youtube:playlist'
2602 def __init__(self, youtube_ie, downloader=None):
2603 InfoExtractor.__init__(self, downloader)
2604 self._youtube_ie = youtube_ie
2606 def report_download_page(self, playlist_id, pagenum):
2607 """Report attempt to download playlist page with given number."""
2608 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2610 def _real_initialize(self):
2611 self._youtube_ie.initialize()
2613 def _real_extract(self, url):
2614 # Extract playlist id
2615 mobj = re.match(self._VALID_URL, url)
2617 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video-in-playlist URL short-circuits to the plain extractor.
2621 if mobj.group(3) is not None:
2622 self._youtube_ie.extract(mobj.group(3))
2625 # Download playlist pages
2626 # prefix is 'p' as default for playlists but there are other types that need extra care
2627 playlist_prefix = mobj.group(1)
2628 if playlist_prefix == 'a':
2629 playlist_access = 'artist'
2631 playlist_prefix = 'p'
2632 playlist_access = 'view_play_list'
2633 playlist_id = mobj.group(2)
2638 self.report_download_page(playlist_id, pagenum)
2639 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2640 request = urllib2.Request(url)
2642 page = urllib2.urlopen(request).read()
2643 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2644 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2647 # Extract video identifiers
2649 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2650 if mobj.group(1) not in ids_in_page:
2651 ids_in_page.append(mobj.group(1))
2652 video_ids.extend(ids_in_page)
2654 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2656 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; -1 end means "to the end").
2658 playliststart = self._downloader.params.get('playliststart', 1) - 1
2659 playlistend = self._downloader.params.get('playlistend', -1)
2660 if playlistend == -1:
2661 video_ids = video_ids[playliststart:]
2663 video_ids = video_ids[playliststart:playlistend]
2665 for id in video_ids:
2666 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2670 class YoutubeUserIE(InfoExtractor):
2671 """Information Extractor for YouTube users."""
# NOTE(review): lines are elided in this excerpt (fused numbering jumps);
# loop headers, guards and `return`s are missing from view. Confirm against
# the complete file before changing logic.
2673 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2674 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps per-request results; pagination uses start-index.
2675 _GDATA_PAGE_SIZE = 50
2676 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2677 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2679 IE_NAME = u'youtube:user'
2681 def __init__(self, youtube_ie, downloader=None):
2682 InfoExtractor.__init__(self, downloader)
2683 self._youtube_ie = youtube_ie
2685 def report_download_page(self, username, start_index):
2686 """Report attempt to download user page."""
2687 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2688 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2690 def _real_initialize(self):
2691 self._youtube_ie.initialize()
2693 def _real_extract(self, url):
2695 mobj = re.match(self._VALID_URL, url)
2697 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2700 username = mobj.group(1)
2702 # Download video ids using YouTube Data API. Result size per
2703 # query is limited (currently to 50 videos) so we need to query
2704 # page by page until there are no video ids - it means we got
2711 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2712 self.report_download_page(username, start_index)
2714 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2717 page = urllib2.urlopen(request).read()
2718 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2719 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2722 # Extract video identifiers
2725 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2726 if mobj.group(1) not in ids_in_page:
2727 ids_in_page.append(mobj.group(1))
2729 video_ids.extend(ids_in_page)
2731 # A little optimization - if current page is not
2732 # "full", ie. does not contain PAGE_SIZE video ids then
2733 # we can assume that this page is the last one - there
2734 # are no more ids on further pages - no need to query
2737 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start/--playlist-end slicing, as in YoutubePlaylistIE.
2742 all_ids_count = len(video_ids)
2743 playliststart = self._downloader.params.get('playliststart', 1) - 1
2744 playlistend = self._downloader.params.get('playlistend', -1)
2746 if playlistend == -1:
2747 video_ids = video_ids[playliststart:]
2749 video_ids = video_ids[playliststart:playlistend]
2751 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2752 (username, all_ids_count, len(video_ids)))
2754 for video_id in video_ids:
2755 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2758 class DepositFilesIE(InfoExtractor):
2759 """Information extractor for depositfiles.com"""
# NOTE(review): lines are elided in this excerpt (fused numbering jumps);
# `try:` lines, `if mobj is None:` guards and `return`s are missing from
# view. Confirm against the complete file before changing logic.
2761 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2762 IE_NAME = u'DepositFiles'
2764 def __init__(self, downloader=None):
2765 InfoExtractor.__init__(self, downloader)
2767 def report_download_webpage(self, file_id):
2768 """Report webpage download."""
2769 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2771 def report_extraction(self, file_id):
2772 """Report information extraction."""
2773 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2775 def _real_extract(self, url):
2776 # At this point we have a new file
2777 self._downloader.increment_downloads()
2779 file_id = url.split('/')[-1]
2780 # Rebuild url in english locale
# Force the /en/ locale so the regexes below match the English page text.
2781 url = 'http://depositfiles.com/en/files/' + file_id
2783 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2784 free_download_indication = { 'gateway_result' : '1' }
2785 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2787 self.report_download_webpage(file_id)
2788 webpage = urllib2.urlopen(request).read()
2789 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2790 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2793 # Search for the real file URL
2794 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2795 if (mobj is None) or (mobj.group(1) is None):
2796 # Try to figure out reason of the error.
# The site explains download restrictions in an "Attention..." banner.
2797 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2798 if (mobj is not None) and (mobj.group(1) is not None):
2799 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2800 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2802 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2805 file_url = mobj.group(1)
2806 file_extension = os.path.splitext(file_url)[1][1:]
2808 # Search for file title
2809 mobj = re.search(r'<b title="(.*?)">', webpage)
2811 self._downloader.trouble(u'ERROR: unable to extract title')
2813 file_title = mobj.group(1).decode('utf-8')
2816 # Process file information
2817 self._downloader.process_info({
2818 'id': file_id.decode('utf-8'),
2819 'url': file_url.decode('utf-8'),
2821 'upload_date': u'NA',
2822 'title': file_title,
2823 'stitle': file_title,
2824 'ext': file_extension.decode('utf-8'),
2828 except UnavailableVideoError, err:
2829 self._downloader.trouble(u'ERROR: unable to download file')
2832 class FacebookIE(InfoExtractor):
2833 """Information Extractor for Facebook"""
# NOTE(review): this excerpt elides many lines (fused numbering jumps),
# including `try:` lines, `if ...:` guards, `return`s and parts of dict
# literals. Confirm against the complete file before changing logic.
2835 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2836 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2837 _NETRC_MACHINE = 'facebook'
# Format preference order, best first; mapped to extensions in an elided
# _video_extensions dict literal.
2838 _available_formats = ['video', 'highqual', 'lowqual']
2839 _video_extensions = {
2844 IE_NAME = u'facebook'
2846 def __init__(self, downloader=None):
2847 InfoExtractor.__init__(self, downloader)
2849 def _reporter(self, message):
2850 """Add header and report message."""
2851 self._downloader.to_screen(u'[facebook] %s' % message)
2853 def report_login(self):
2854 """Report attempt to log in."""
2855 self._reporter(u'Logging in')
2857 def report_video_webpage_download(self, video_id):
2858 """Report attempt to download video webpage."""
2859 self._reporter(u'%s: Downloading video webpage' % video_id)
2861 def report_information_extraction(self, video_id):
2862 """Report attempt to extract video information."""
2863 self._reporter(u'%s: Extracting video information' % video_id)
2865 def _parse_page(self, video_webpage):
2866 """Extract video information from page"""
# Scrape title/description/owner/thumbnail out of inline JS calls and HTML.
2868 data = {'title': r'\("video_title", "(.*?)"\)',
2869 'description': r'<div class="datawrap">(.*?)</div>',
2870 'owner': r'\("video_owner_name", "(.*?)"\)',
2871 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2874 for piece in data.keys():
2875 mobj = re.search(data[piece], video_webpage)
2876 if mobj is not None:
2877 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format, best first.
2881 for fmt in self._available_formats:
2882 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2883 if mobj is not None:
2884 # URL is in a Javascript segment inside an escaped Unicode format within
2885 # the generally utf-8 page
2886 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2887 video_info['video_urls'] = video_urls
2891 def _real_initialize(self):
# Login step: credentials come from --username/--password or ~/.netrc.
2892 if self._downloader is None:
2897 downloader_params = self._downloader.params
2899 # Attempt to use provided username and password or .netrc data
2900 if downloader_params.get('username', None) is not None:
2901 useremail = downloader_params['username']
2902 password = downloader_params['password']
2903 elif downloader_params.get('usenetrc', False):
2905 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2906 if info is not None:
2910 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2911 except (IOError, netrc.NetrcParseError), err:
2912 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2915 if useremail is None:
2924 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2927 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed.
2928 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2929 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2932 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2935 def _real_extract(self, url):
2936 mobj = re.match(self._VALID_URL, url)
2938 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2940 video_id = mobj.group('ID')
2943 self.report_video_webpage_download(video_id)
2944 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2946 page = urllib2.urlopen(request)
2947 video_webpage = page.read()
2948 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2949 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2952 # Start extracting information
2953 self.report_information_extraction(video_id)
2955 # Extract information
2956 video_info = self._parse_page(video_webpage)
2959 if 'owner' not in video_info:
2960 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2962 video_uploader = video_info['owner']
2965 if 'title' not in video_info:
2966 self._downloader.trouble(u'ERROR: unable to extract video title')
2968 video_title = video_info['title']
2969 video_title = video_title.decode('utf-8')
2970 video_title = sanitize_title(video_title)
2972 simple_title = _simplify_title(video_title)
# Missing thumbnail is a warning, not a fatal error.
2975 if 'thumbnail' not in video_info:
2976 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2977 video_thumbnail = ''
2979 video_thumbnail = video_info['thumbnail']
# Upload date: parse RFC-2822-style date into YYYYMMDD when present.
2983 if 'upload_date' in video_info:
2984 upload_time = video_info['upload_date']
2985 timetuple = email.utils.parsedate_tz(upload_time)
2986 if timetuple is not None:
2988 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2993 video_description = video_info.get('description', 'No description available.')
# Format selection: honor --format / --max-quality against the formats
# actually found on the page, mirroring the YouTube extractor's logic.
2995 url_map = video_info['video_urls']
2996 if len(url_map.keys()) > 0:
2997 # Decide which formats to download
2998 req_format = self._downloader.params.get('format', None)
2999 format_limit = self._downloader.params.get('format_limit', None)
3001 if format_limit is not None and format_limit in self._available_formats:
3002 format_list = self._available_formats[self._available_formats.index(format_limit):]
3004 format_list = self._available_formats
3005 existing_formats = [x for x in format_list if x in url_map]
3006 if len(existing_formats) == 0:
3007 self._downloader.trouble(u'ERROR: no known formats available for video')
3009 if req_format is None:
3010 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3011 elif req_format == 'worst':
3012 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3013 elif req_format == '-1':
3014 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3017 if req_format not in url_map:
3018 self._downloader.trouble(u'ERROR: requested format not available')
3020 video_url_list = [(req_format, url_map[req_format])] # Specific format
3022 for format_param, video_real_url in video_url_list:
3024 # At this point we have a new video
3025 self._downloader.increment_downloads()
3028 video_extension = self._video_extensions.get(format_param, 'mp4')
3031 # Process video information
3032 self._downloader.process_info({
3033 'id': video_id.decode('utf-8'),
3034 'url': video_real_url.decode('utf-8'),
3035 'uploader': video_uploader.decode('utf-8'),
3036 'upload_date': upload_date,
3037 'title': video_title,
3038 'stitle': simple_title,
3039 'ext': video_extension.decode('utf-8'),
3040 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3041 'thumbnail': video_thumbnail.decode('utf-8'),
3042 'description': video_description.decode('utf-8'),
3045 except UnavailableVideoError, err:
3046 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this excerpt is garbled — interior lines are missing (the
# embedded original line numbers jump) and indentation has been stripped.
# Comments below describe only the code that is visible; control-flow lines
# such as `try:`/`return` between the visible statements were elided.
3048 class BlipTVIE(InfoExtractor):
3049 """Information extractor for blip.tv"""
3051 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3052 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3053 IE_NAME = u'blip.tv'
3055 def report_extraction(self, file_id):
3056 """Report information extraction."""
3057 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3059 def report_direct_download(self, title):
3060 """Report that a direct (non-API) video download was detected."""
3061 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3063 def _real_extract(self, url):
# Ask blip.tv's JSON API for the page's metadata; if the server answers
# with Content-Type video/* the URL is itself the media file.
3064 mobj = re.match(self._VALID_URL, url)
3066 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3073 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3074 request = urllib2.Request(json_url)
3075 self.report_extraction(mobj.group(1))
3078 urlh = urllib2.urlopen(request)
3079 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3080 basename = url.split('/')[-1]
3081 title,ext = os.path.splitext(basename)
3082 title = title.decode('UTF-8')
3083 ext = ext.replace('.', '')
3084 self.report_direct_download(title)
3089 'stitle': _simplify_title(title),
3093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3094 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3096 if info is None: # Regular URL
3098 json_code = urlh.read()
3099 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3100 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3104 json_data = json.loads(json_code)
3105 if 'Post' in json_data:
3106 data = json_data['Post']
# blip.tv datestamps are parsed with '%m-%d-%y %H:%M%p' and normalized
# to YYYYMMDD for the info dict.
3110 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3111 video_url = data['media']['url']
# Derive the file extension from the media URL via _URL_EXT.
3112 umobj = re.match(self._URL_EXT, video_url)
3114 raise ValueError('Can not determine filename extension')
3115 ext = umobj.group(1)
3118 'id': data['item_id'],
3120 'uploader': data['display_name'],
3121 'upload_date': upload_date,
3122 'title': data['title'],
3123 'stitle': _simplify_title(data['title']),
3125 'format': data['media']['mimeType'],
3126 'thumbnail': data['thumbnailUrl'],
3127 'description': data['description'],
3128 'player_url': data['embedUrl']
# KeyError from the dict lookups and ValueError from date/extension
# parsing are both reported as a parse failure.
3130 except (ValueError,KeyError), err:
3131 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3134 self._downloader.increment_downloads()
3137 self._downloader.process_info(info)
3138 except UnavailableVideoError, err:
3139 self._downloader.trouble(u'\nERROR: unable to download video')
3142 class MyVideoIE(InfoExtractor):
3143 """Information Extractor for myvideo.de."""
3145 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3146 IE_NAME = u'myvideo'
3148 def __init__(self, downloader=None):
3149 InfoExtractor.__init__(self, downloader)
3151 def report_download_webpage(self, video_id):
3152 """Report webpage download."""
3153 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3155 def report_extraction(self, video_id):
3156 """Report information extraction."""
3157 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3159 def _real_extract(self,url):
3160 mobj = re.match(self._VALID_URL, url)
3162 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3165 video_id = mobj.group(1)
3168 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3170 self.report_download_webpage(video_id)
3171 webpage = urllib2.urlopen(request).read()
3172 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3173 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3176 self.report_extraction(video_id)
3177 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3180 self._downloader.trouble(u'ERROR: unable to extract media URL')
3182 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3184 mobj = re.search('<title>([^<]+)</title>', webpage)
3186 self._downloader.trouble(u'ERROR: unable to extract title')
3189 video_title = mobj.group(1)
3190 video_title = sanitize_title(video_title)
3192 simple_title = _simplify_title(video_title)
3195 self._downloader.process_info({
3199 'upload_date': u'NA',
3200 'title': video_title,
3201 'stitle': simple_title,
3206 except UnavailableVideoError:
3207 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): damaged excerpt — interior lines (`if mobj is None:`,
# `try:`, `return`, blank lines) are elided and indentation is stripped;
# comments describe only the visible statements.
3209 class ComedyCentralIE(InfoExtractor):
3210 """Information extractor for The Daily Show and Colbert Report """
# The URL pattern also accepts the shorthand forms ":tds", ":thedailyshow",
# ":cr", ":colbert", ":colbertnation" and ":colbertreport".
3212 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3213 IE_NAME = u'comedycentral'
3215 def report_extraction(self, episode_id):
3216 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3218 def report_config_download(self, episode_id):
3219 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3221 def report_index_download(self, episode_id):
3222 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3224 def report_player_url(self, episode_id):
3225 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3227 def _real_extract(self, url):
3228 mobj = re.match(self._VALID_URL, url)
3230 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shorthand forms redirect to the show's full-episodes page, which then
# redirects again to the newest episode.
3233 if mobj.group('shortname'):
3234 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3235 url = u'http://www.thedailyshow.com/full-episodes/'
3237 url = u'http://www.colbertnation.com/full-episodes/'
3238 mobj = re.match(self._VALID_URL, url)
3239 assert mobj is not None
3241 dlNewest = not mobj.group('episode')
3243 epTitle = mobj.group('showname')
3245 epTitle = mobj.group('episode')
3247 req = urllib2.Request(url)
3248 self.report_extraction(epTitle)
3250 htmlHandle = urllib2.urlopen(req)
3251 html = htmlHandle.read()
3252 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3253 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the server-side redirect to the concrete episode URL.
3256 url = htmlHandle.geturl()
3257 mobj = re.match(self._VALID_URL, url)
3259 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3261 if mobj.group('episode') == '':
3262 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3264 epTitle = mobj.group('episode')
# The Flash player URL embeds an mtvnservices URI that identifies the episode.
3266 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3267 if len(mMovieParams) == 0:
3268 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3271 playerUrl_raw = mMovieParams[0][0]
3272 self.report_player_url(epTitle)
3274 urlHandle = urllib2.urlopen(playerUrl_raw)
3275 playerUrl = urlHandle.geturl()
3276 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3277 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3280 uri = mMovieParams[0][1]
3281 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3282 self.report_index_download(epTitle)
3284 indexXml = urllib2.urlopen(indexUrl).read()
3285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3286 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# The MRSS index lists one <item> per video segment of the episode.
3289 idoc = xml.etree.ElementTree.fromstring(indexXml)
3290 itemEls = idoc.findall('.//item')
3291 for itemEl in itemEls:
3292 mediaId = itemEl.findall('./guid')[0].text
3293 shortMediaId = mediaId.split(':')[-1]
3294 showId = mediaId.split(':')[-2].replace('.com', '')
3295 officialTitle = itemEl.findall('./title')[0].text
3296 officialDate = itemEl.findall('./pubDate')[0].text
3298 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3299 urllib.urlencode({'uri': mediaId}))
3300 configReq = urllib2.Request(configUrl)
3301 self.report_config_download(epTitle)
3303 configXml = urllib2.urlopen(configReq).read()
3304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3305 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3308 cdoc = xml.etree.ElementTree.fromstring(configXml)
3310 for rendition in cdoc.findall('.//rendition'):
3311 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3315 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3318 # For now, just pick the highest bitrate
3319 format,video_url = turls[-1]
3321 self._downloader.increment_downloads()
3323 effTitle = showId + u'-' + epTitle
3328 'upload_date': officialDate,
3330 'stitle': _simplify_title(effTitle),
3334 'description': officialTitle,
3335 'player_url': playerUrl
3339 self._downloader.process_info(info)
3340 except UnavailableVideoError, err:
3341 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): damaged excerpt — `try:`/`return`/`if ... is None:` lines
# between the visible statements are elided and indentation is stripped.
3345 class EscapistIE(InfoExtractor):
3346 """Information extractor for The Escapist """
3348 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3349 IE_NAME = u'escapist'
3351 def report_extraction(self, showName):
3352 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3354 def report_config_download(self, showName):
3355 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3357 def _real_extract(self, url):
3358 mobj = re.match(self._VALID_URL, url)
3360 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3362 showName = mobj.group('showname')
3363 videoId = mobj.group('episode')
3365 self.report_extraction(showName)
3367 webPage = urllib2.urlopen(url).read()
3368 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3369 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Pull description, thumbnail and player URL out of the page's meta tags;
# the player URL carries a "config=" query parameter pointing at the
# playlist configuration.
3372 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3373 description = unescapeHTML(descMatch.group(1))
3374 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3375 imgUrl = unescapeHTML(imgMatch.group(1))
3376 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3377 playerUrl = unescapeHTML(playerUrlMatch.group(1))
3378 configUrlMatch = re.search('config=(.*)$', playerUrl)
3379 configUrl = urllib2.unquote(configUrlMatch.group(1))
3381 self.report_config_download(showName)
3383 configJSON = urllib2.urlopen(configUrl).read()
3384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3385 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3388 # Technically, it's JavaScript, not JSON
# NOTE(review): blanket quote replacement breaks if any value contains an
# apostrophe — fragile, but matches the site's format at the time.
3389 configJSON = configJSON.replace("'", '"')
3392 config = json.loads(configJSON)
3393 except (ValueError,), err:
3394 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is the actual video entry (index 0 is presumably an ad or
# intro slot — TODO confirm).
3397 playlist = config['playlist']
3398 videoUrl = playlist[1]['url']
3400 self._downloader.increment_downloads()
3404 'uploader': showName,
3405 'upload_date': None,
3407 'stitle': _simplify_title(showName),
3410 'thumbnail': imgUrl,
3411 'description': description,
3412 'player_url': playerUrl,
3416 self._downloader.process_info(info)
3417 except UnavailableVideoError, err:
3418 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3421 class CollegeHumorIE(InfoExtractor):
3422 """Information extractor for collegehumor.com"""
3424 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3425 IE_NAME = u'collegehumor'
3427 def report_webpage(self, video_id):
3428 """Report webpage download."""
3429 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3431 def report_extraction(self, video_id):
3432 """Report information extraction."""
3433 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3435 def _real_extract(self, url):
3436 mobj = re.match(self._VALID_URL, url)
3438 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3440 video_id = mobj.group('videoid')
3442 self.report_webpage(video_id)
3443 request = urllib2.Request(url)
3445 webpage = urllib2.urlopen(request).read()
3446 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3447 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page's numeric URL id differs from the internal id used by the
# moogaloop metadata endpoint; scrape the internal one from the markup.
3450 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3452 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3454 internal_video_id = m.group('internalvideoid')
3458 'internal_id': internal_video_id,
3461 self.report_extraction(video_id)
3462 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3464 metaXml = urllib2.urlopen(xmlUrl).read()
3465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3466 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Fill the info dict from the <video> element of the metadata XML.
3469 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3471 videoNode = mdoc.findall('./video')[0]
3472 info['description'] = videoNode.findall('./description')[0].text
3473 info['title'] = videoNode.findall('./caption')[0].text
3474 info['stitle'] = _simplify_title(info['title'])
3475 info['url'] = videoNode.findall('./file')[0].text
3476 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the file URL; format mirrors it.
3477 info['ext'] = info['url'].rpartition('.')[2]
3478 info['format'] = info['ext']
3480 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3483 self._downloader.increment_downloads()
3486 self._downloader.process_info(info)
3487 except UnavailableVideoError, err:
3488 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3491 class XVideosIE(InfoExtractor):
3492 """Information extractor for xvideos.com"""
3494 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3495 IE_NAME = u'xvideos'
3497 def report_webpage(self, video_id):
3498 """Report webpage download."""
3499 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3501 def report_extraction(self, video_id):
3502 """Report information extraction."""
3503 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3505 def _real_extract(self, url):
3506 mobj = re.match(self._VALID_URL, url)
3508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3510 video_id = mobj.group(1).decode('utf-8')
3512 self.report_webpage(video_id)
3514 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3516 webpage = urllib2.urlopen(request).read()
3517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3521 self.report_extraction(video_id)
# The percent-encoded flv URL is embedded in the page as a flv_url= param.
3525 mobj = re.search(r'flv_url=(.+?)&', webpage)
3527 self._downloader.trouble(u'ERROR: unable to extract video url')
3529 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text with the trailing " - XVID..." suffix dropped.
3533 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3535 self._downloader.trouble(u'ERROR: unable to extract video title')
3537 video_title = mobj.group(1).decode('utf-8')
3540 # Extract video thumbnail
3541 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3543 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3545 video_thumbnail = mobj.group(1).decode('utf-8')
3549 self._downloader.increment_downloads()
3554 'upload_date': None,
3555 'title': video_title,
3556 'stitle': _simplify_title(video_title),
3559 'thumbnail': video_thumbnail,
3560 'description': None,
3565 self._downloader.process_info(info)
3566 except UnavailableVideoError, err:
3567 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3570 class SoundcloudIE(InfoExtractor):
3571 """Information extractor for soundcloud.com
3572 To access the media, the uid of the song and a stream token
3573 must be extracted from the page source and the script must make
3574 a request to media.soundcloud.com/crossdomain.xml. Then
3575 the media can be grabbed by requesting from an url composed
3576 of the stream token and uid
3579 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3580 IE_NAME = u'soundcloud'
3582 def __init__(self, downloader=None):
3583 InfoExtractor.__init__(self, downloader)
3585 def report_webpage(self, video_id):
3586 """Report webpage download."""
3587 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3589 def report_extraction(self, video_id):
3590 """Report information extraction."""
3591 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3593 def _real_extract(self, url):
3594 mobj = re.match(self._VALID_URL, url)
3596 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3599 # extract uploader (which is in the url)
3600 uploader = mobj.group(1).decode('utf-8')
3601 # extract simple title (uploader + slug of song title)
3602 slug_title = mobj.group(2).decode('utf-8')
3603 simple_title = uploader + '-' + slug_title
3605 self.report_webpage('%s/%s' % (uploader, slug_title))
3607 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3609 webpage = urllib2.urlopen(request).read()
3610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3611 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3614 self.report_extraction('%s/%s' % (uploader, slug_title))
3616 # extract uid and stream token that soundcloud hands out for access
3617 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3619 video_id = mobj.group(1)
3620 stream_token = mobj.group(2)
3622 # extract unsimplified title
3623 mobj = re.search('"title":"(.*?)",', webpage)
3625 title = mobj.group(1)
3627 # construct media url (with uid/token)
3628 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3629 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; fall back to a placeholder.
3632 description = u'No description available'
3633 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3635 description = mobj.group(1)
# Upload date is parsed from the "pretty-date" text, e.g.
# "on November 7, 2011 02:55"; failures are tolerated (see except below).
3639 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3642 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3643 except Exception, e:
3646 # for soundcloud, a request to a cross domain is required for cookies
3647 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3650 self._downloader.process_info({
3651 'id': video_id.decode('utf-8'),
3653 'uploader': uploader.decode('utf-8'),
3654 'upload_date': upload_date,
3655 'title': simple_title.decode('utf-8'),
3656 'stitle': simple_title.decode('utf-8'),
3660 'description': description.decode('utf-8')
3662 except UnavailableVideoError:
3663 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3666 class InfoQIE(InfoExtractor):
3667 """Information extractor for infoq.com"""
3669 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3672 def report_webpage(self, video_id):
3673 """Report webpage download."""
3674 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3676 def report_extraction(self, video_id):
3677 """Report information extraction."""
3678 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3680 def _real_extract(self, url):
3681 mobj = re.match(self._VALID_URL, url)
3683 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3686 self.report_webpage(url)
3688 request = urllib2.Request(url)
3690 webpage = urllib2.urlopen(request).read()
3691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3692 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3695 self.report_extraction(url)
# The media path is base64-encoded in the jsclassref attribute, then
# percent-decoded and appended to the rtmpe streaming base URL.
3699 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3701 self._downloader.trouble(u'ERROR: unable to extract video url')
3703 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3707 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3709 self._downloader.trouble(u'ERROR: unable to extract video title')
3711 video_title = mobj.group(1).decode('utf-8')
3713 # Extract description
3714 video_description = u'No description available.'
3715 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3716 if mobj is not None:
3717 video_description = mobj.group(1).decode('utf-8')
# The id and extension come from the final path component of the media URL.
3719 video_filename = video_url.split('/')[-1]
3720 video_id, extension = video_filename.split('.')
3722 self._downloader.increment_downloads()
3727 'upload_date': None,
3728 'title': video_title,
3729 'stitle': _simplify_title(video_title),
3731 'format': extension, # Extension is always(?) mp4, but seems to be flv
3733 'description': video_description,
3738 self._downloader.process_info(info)
3739 except UnavailableVideoError, err:
3740 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3742 class MixcloudIE(InfoExtractor):
3743 """Information extractor for www.mixcloud.com"""
3744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3745 IE_NAME = u'mixcloud'
3747 def __init__(self, downloader=None):
3748 InfoExtractor.__init__(self, downloader)
3750 def report_download_json(self, file_id):
3751 """Report JSON download."""
3752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3754 def report_extraction(self, file_id):
3755 """Report information extraction."""
3756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3758 def get_urls(self, jsonData, fmt, bitrate='best'):
3759 """Get urls from 'audio_formats' section in json"""
# Entries may be keyed by bitrate (dict) or be a flat url list; the
# TypeError fallback below handles the flat case.
3762 bitrate_list = jsonData[fmt]
3763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3764 bitrate = max(bitrate_list) # select highest
3766 url_list = jsonData[fmt][bitrate]
3767 except TypeError: # we have no bitrate info.
3768 url_list = jsonData[fmt]
3772 def check_urls(self, url_list):
3773 """Returns 1st active url from list"""
# Probes each candidate with a GET and keeps the first that answers.
3774 for url in url_list:
3776 urllib2.urlopen(url)
3778 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3783 def _print_formats(self, formats):
3784 print 'Available formats:'
3785 for fmt in formats.keys():
3786 for b in formats[fmt]:
3788 ext = formats[fmt][b][0]
3789 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3790 except TypeError: # we have no bitrate info
3791 ext = formats[fmt][0]
3792 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3795 def _real_extract(self, url):
3796 mobj = re.match(self._VALID_URL, url)
3798 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3800 # extract uploader & filename from url
3801 uploader = mobj.group(1).decode('utf-8')
3802 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3804 # construct API request
3805 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3806 # retrieve .json file with links to files
3807 request = urllib2.Request(file_url)
3809 self.report_download_json(file_url)
3810 jsonData = urllib2.urlopen(request).read()
3811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3812 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
# parse JSON
3816 json_data = json.loads(jsonData)
3817 player_url = json_data['player_swf_url']
3818 formats = dict(json_data['audio_formats'])
3820 req_format = self._downloader.params.get('format', None)
3823 if self._downloader.params.get('listformats', None):
3824 self._print_formats(formats)
# 'best' (or no preference): take the first format whose URL probe succeeds.
3827 if req_format is None or req_format == 'best':
3828 for format_param in formats.keys():
3829 url_list = self.get_urls(formats, format_param)
3831 file_url = self.check_urls(url_list)
3832 if file_url is not None:
3835 if req_format not in formats.keys():
3836 self._downloader.trouble(u'ERROR: format is not available')
3839 url_list = self.get_urls(formats, req_format)
3840 file_url = self.check_urls(url_list)
3841 format_param = req_format
3844 self._downloader.increment_downloads()
3846 # Process file information
3847 self._downloader.process_info({
3848 'id': file_id.decode('utf-8'),
3849 'url': file_url.decode('utf-8'),
3850 'uploader': uploader.decode('utf-8'),
3851 'upload_date': u'NA',
3852 'title': json_data['name'],
3853 'stitle': _simplify_title(json_data['name']),
3854 'ext': file_url.split('.')[-1].decode('utf-8'),
3855 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3856 'thumbnail': json_data['thumbnail_url'],
3857 'description': json_data['description'],
3858 'player_url': player_url.decode('utf-8'),
3860 except UnavailableVideoError, err:
3861 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): damaged excerpt — elided control-flow lines, stripped indent.
3863 class StanfordOpenClassroomIE(InfoExtractor):
3864 """Information extractor for Stanford's Open ClassRoom"""
# The URL decides the extraction mode: a specific video, a whole course,
# or (bare root) the entire site — the latter two recurse via self.extract().
3866 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3867 IE_NAME = u'stanfordoc'
3869 def report_download_webpage(self, objid):
3870 """Report webpage download."""
3871 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3873 def report_extraction(self, video_id):
3874 """Report information extraction."""
3875 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3877 def _real_extract(self, url):
3878 mobj = re.match(self._VALID_URL, url)
3880 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3883 if mobj.group('course') and mobj.group('video'): # A specific video
3884 course = mobj.group('course')
3885 video = mobj.group('video')
3887 'id': _simplify_title(course + '_' + video),
3890 self.report_extraction(info['id'])
3891 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3892 xmlUrl = baseUrl + video + '.xml'
3894 metaXml = urllib2.urlopen(xmlUrl).read()
3895 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3896 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3898 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3900 info['title'] = mdoc.findall('./title')[0].text
3901 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3903 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3905 info['stitle'] = _simplify_title(info['title'])
3906 info['ext'] = info['url'].rpartition('.')[2]
3907 info['format'] = info['ext']
3908 self._downloader.increment_downloads()
3910 self._downloader.process_info(info)
3911 except UnavailableVideoError, err:
3912 self._downloader.trouble(u'\nERROR: unable to download video')
3913 elif mobj.group('course'): # A course page
3914 course = mobj.group('course')
3916 'id': _simplify_title(course),
3920 self.report_download_webpage(info['id'])
3922 coursepage = urllib2.urlopen(url).read()
3923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3924 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3927 m = re.search('<h1>([^<]+)</h1>', coursepage)
3929 info['title'] = unescapeHTML(m.group(1))
3931 info['title'] = info['id']
3932 info['stitle'] = _simplify_title(info['title'])
3934 m = re.search('<description>([^<]+)</description>', coursepage)
3936 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry recursively extracted below.
3938 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3941 'type': 'reference',
3942 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3946 for entry in info['list']:
3947 assert entry['type'] == 'reference'
3948 self.extract(entry['url'])
# Root page: enumerate every CoursePage link and recurse into each course.
3951 'id': 'Stanford OpenClassroom',
3955 self.report_download_webpage(info['id'])
3956 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3958 rootpage = urllib2.urlopen(rootURL).read()
3959 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3960 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3963 info['title'] = info['id']
3964 info['stitle'] = _simplify_title(info['title'])
3966 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3969 'type': 'reference',
3970 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3974 for entry in info['list']:
3975 assert entry['type'] == 'reference'
3976 self.extract(entry['url'])
3978 class MTVIE(InfoExtractor):
3979 """Information extractor for MTV.com"""
3981 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3984 def report_webpage(self, video_id):
3985 """Report information extraction."""
3986 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3988 def report_extraction(self, video_id):
3989 """Report information extraction."""
3990 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3992 def _real_extract(self, url):
3993 mobj = re.match(self._VALID_URL, url)
3995 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3997 if not mobj.group('proto'):
3998 url = 'http://' + url
3999 video_id = mobj.group('videoid')
4000 self.report_webpage(video_id)
4002 request = urllib2.Request(url)
4004 webpage = urllib2.urlopen(request).read()
4005 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4006 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4009 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4011 self._downloader.trouble(u'ERROR: unable to extract song name')
4013 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4014 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4016 self._downloader.trouble(u'ERROR: unable to extract performer')
4018 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4019 video_title = performer + ' - ' + song_name
4021 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4023 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4025 mtvn_uri = mobj.group(1)
4027 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4029 self._downloader.trouble(u'ERROR: unable to extract content id')
4031 content_id = mobj.group(1)
4033 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4034 self.report_extraction(video_id)
4035 request = urllib2.Request(videogen_url)
4037 metadataXml = urllib2.urlopen(request).read()
4038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4039 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4042 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4043 renditions = mdoc.findall('.//rendition')
4045 # For now, always pick the highest quality.
4046 rendition = renditions[-1]
4049 _,_,ext = rendition.attrib['type'].partition('/')
4050 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4051 video_url = rendition.find('./src').text
4053 self._downloader.trouble('Invalid rendition field.')
4056 self._downloader.increment_downloads()
4060 'uploader': performer,
4061 'title': video_title,
4062 'stitle': _simplify_title(video_title),
4068 self._downloader.process_info(info)
4069 except UnavailableVideoError, err:
4070 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for post-processing steps.

	A PostProcessor is registered on a downloader through the
	downloader's add_post_processor() method. After every successful
	download, the downloader walks its chain of PostProcessors and
	calls run() on each one, feeding the first call an initial info
	dictionary and each later call the value returned by the previous
	processor. The chain stops as soon as a processor returns None or
	the end of the chain is reached.

	PostProcessor objects follow the same "mutual registration"
	pattern as InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to *downloader*."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is a dictionary like the ones composed by
		InfoExtractors, extended with a "filepath" key naming the
		downloaded file on disk.

		Return None to stop the postprocessing chain, or an
		(optionally modified) information dictionary to hand to the
		next processor. May raise PostProcessingError, which the
		downloader takes into account for error reporting.
		"""
		return information  # the base class is a no-op pass-through
class AudioConversionError(Exception):
	"""Raised by FFmpegExtractAudioPP when ffprobe/ffmpeg fails.

	Fix: derive from Exception rather than BaseException. BaseException
	is reserved for process-exit signals (KeyboardInterrupt, SystemExit);
	deriving from it makes generic ``except Exception`` boundaries miss
	this error. Existing callers are unaffected: they detect it with
	``isinstance(e, AudioConversionError)``, and Exception is itself a
	BaseException subclass.

	The failure description is kept in .message for callers that
	report it (e.g. FFmpegExtractAudioPP.run).
	"""
	def __init__(self, message):
		# Also pass the message to the parent so str(err) is useful.
		Exception.__init__(self, message)
		self.message = message
# NOTE(review): this chunk is a mangled extract — each line carries a stray
# embedded line number and the numbering is non-contiguous, i.e. several
# source lines (try/except scaffolding, returns, decorators, blank lines)
# are missing from this view. Code is kept byte-identical; comments only.
#
# Post processor that converts a downloaded video into an audio-only file by
# shelling out to ffprobe (codec detection) and ffmpeg (conversion).
4123 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best'|'aac'|'mp3'|'vorbis'|'m4a'|'wav'; None means 'best'.
# preferredquality: ffmpeg '-ab' bitrate spec (e.g. '128K'), or None.
# keepvideo: when False, the source video is deleted after conversion.
4125 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4126 PostProcessor.__init__(self, downloader)
4127 if preferredcodec is None:
4128 preferredcodec = 'best'
4129 self._preferredcodec = preferredcodec
4130 self._preferredquality = preferredquality
4131 self._keepvideo = keepvideo
# Detect the audio codec of `path` by parsing `ffprobe -show_streams` output.
# NOTE(review): the `@staticmethod` decorator, the `try:` around the Popen
# call, the `audio_codec = None` initialization and the `return` lines are
# among the elided lines — presumably returns the codec name string, or
# None on failure; confirm against the full source.
4134 def get_audio_codec(path):
4136 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
# `file(...)` and this Popen usage are Python 2 idioms; stderr is discarded.
4137 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4138 output = handle.communicate()[0]
4139 if handle.wait() != 0:
4141 except (IOError, OSError):
# Scan key=value lines; remember the last codec_name seen, accept it once a
# codec_type=audio line confirms it belongs to an audio stream.
4144 for line in output.split('\n'):
4145 if line.startswith('codec_name='):
4146 audio_codec = line.split('=')[1].strip()
4147 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode `path` into `out_path` with the given audio codec
# and extra options. Raises AudioConversionError on failure.
# NOTE(review): the branch handling `codec is None` (wav case) is elided.
4152 def run_ffmpeg(path, out_path, codec, more_opts):
4156 acodec_opts = ['-acodec', codec]
# '-vn' drops the video stream; '--' guards against filenames starting with '-'.
4157 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4159 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4160 stdout,stderr = p.communicate()
4161 except (IOError, OSError):
4162 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4163 if isinstance(e, OSError) and e.errno == 2:
4164 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
# Non-zero exit: surface ffmpeg's last stderr line as the error message.
4167 if p.returncode != 0:
4168 msg = stderr.strip().split('\n')[-1]
4169 raise AudioConversionError(msg)
# PostProcessor entry point: decide target codec/container, transcode, then
# fix up mtime and optionally delete the source video.
4171 def run(self, information):
4172 path = information['filepath']
4174 filecodec = self.get_audio_codec(path)
4175 if filecodec is None:
4176 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Case 1: requested codec matches the source (or 'best', or aac->m4a remux):
# keep the stream lossless where possible.
4180 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4181 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4182 # Lossless, but in another container
# '-absf aac_adtstoasc' converts the ADTS bitstream for the mp4 container.
4184 extension = self._preferredcodec
4185 more_opts = ['-absf', 'aac_adtstoasc']
4186 elif filecodec in ['aac', 'mp3', 'vorbis']:
4187 # Lossless if possible
4189 extension = filecodec
4190 if filecodec == 'aac':
4191 more_opts = ['-f', 'adts']
4192 if filecodec == 'vorbis':
# Fallback within case 1 (elided lines): MP3 otherwise needs a lossy encode.
4196 acodec = 'libmp3lame'
4199 if self._preferredquality is not None:
4200 more_opts += ['-ab', self._preferredquality]
# Case 2: source codec differs from the requested one — lossy conversion.
4202 # We convert the audio (lossy)
4203 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4204 extension = self._preferredcodec
4206 if self._preferredquality is not None:
4207 more_opts += ['-ab', self._preferredquality]
4208 if self._preferredcodec == 'aac':
4209 more_opts += ['-f', 'adts']
4210 if self._preferredcodec == 'm4a':
4211 more_opts += ['-absf', 'aac_adtstoasc']
4212 if self._preferredcodec == 'vorbis':
4214 if self._preferredcodec == 'wav':
4216 more_opts += ['-f', 'wav']
4218 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4219 new_path = prefix + sep + extension
4220 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4222 self.run_ffmpeg(path, new_path, acodec, more_opts)
# Catch-all error handling (the enclosing try: is elided): distinguish our
# own AudioConversionError from other ffmpeg failures.
4224 etype,e,tb = sys.exc_info()
4225 if isinstance(e, AudioConversionError):
4226 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4228 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4231 # Try to update the date time for extracted audio file.
4232 if information.get('filetime') is not None:
4234 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4236 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Best-effort removal of the original video; failure is only a warning.
4238 if not self._keepvideo:
4240 os.remove(_encodeFilename(path))
4241 except (IOError, OSError):
4242 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Hand the updated filepath to the next processor in the chain.
4245 information['filepath'] = new_path
# NOTE(review): mangled extract — embedded line numbers are non-contiguous,
# so the try/finally scaffolding and close() calls around the two I/O phases
# are elided from this view. Code kept byte-identical; comments only.
4249 def updateSelf(downloader, filename):
4250 ''' Update the program file with the latest version from the repository '''
4251 # Note: downloader only used for options
# Refuse early if we cannot rewrite our own script file.
4252 if not os.access(filename, os.W_OK):
4253 sys.exit('ERROR: no write permissions on %s' % filename)
4255 downloader.to_screen(u'Updating to latest version...')
# Phase 1: fetch the latest script from UPDATE_URL.
4259 urlh = urllib.urlopen(UPDATE_URL)
4260 newcontent = urlh.read()
# Compare the remote __version__ string against ours; nothing to do if equal.
4262 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4263 if vmatch is not None and vmatch.group(1) == __version__:
4264 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
# Python 2 except syntax; any network/file error aborts the process.
4268 except (IOError, OSError), err:
4269 sys.exit('ERROR: unable to download latest version')
# Phase 2: overwrite the current script file with the downloaded content.
4272 outf = open(filename, 'wb')
4274 outf.write(newcontent)
4277 except (IOError, OSError), err:
4278 sys.exit('ERROR: unable to overwrite current version')
4280 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4283 def _readOptions(filename_bytes):
4285 optionf = open(filename_bytes)
4287 return [] # silently skip if file is not present
4291 res += shlex.split(l, comments=True)
4296 def _format_option_string(option):
4297 ''' ('-o', '--option') -> -o, --format METAVAR'''
4301 if option._short_opts: opts.append(option._short_opts[0])
4302 if option._long_opts: opts.append(option._long_opts[0])
4303 if len(opts) > 1: opts.insert(1, ', ')
4305 if option.takes_value(): opts.append(' %s' % option.metavar)
4307 return "".join(opts)
4309 def _find_term_columns():
4310 columns = os.environ.get('COLUMNS', None)
4315 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4316 out,err = sp.communicate()
4317 return int(out.split()[1])
# NOTE(review): interior of parseOpts() — the enclosing `def parseOpts():`
# line and several scaffolding lines (the `kw = {` dict head, try/except
# around the user-config lookup) are elided from this extract (embedded line
# numbers are non-contiguous). Code kept byte-identical; comments only.
#
# Help formatting: widen optparse's help column; wrap to the real terminal
# width when it can be detected.
4323 max_help_position = 80
4325 # No need to wrap help messages if we're on a wide console
4326 columns = _find_term_columns()
4327 if columns: max_width = columns
# Custom formatter so option strings render as "-o, --option METAVAR".
4329 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4330 fmt.format_option_strings = _format_option_string
# OptionParser keyword arguments (the dict-literal head is elided here).
# conflict_handler='resolve' matters below: '-v' is registered twice.
4333 'version' : __version__,
4335 'usage' : '%prog [options] url [url...]',
4336 'conflict_handler' : 'resolve',
4339 parser = optparse.OptionParser(**kw)
# One option group per functional area.
4342 general = optparse.OptionGroup(parser, 'General Options')
4343 selection = optparse.OptionGroup(parser, 'Video Selection')
4344 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4345 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4346 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4347 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4348 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4350 general.add_option('-h', '--help',
4351 action='help', help='print this help text and exit')
4352 general.add_option('-v', '--version',
4353 action='version', help='print program version and exit')
4354 general.add_option('-U', '--update',
4355 action='store_true', dest='update_self', help='update this program to latest version')
4356 general.add_option('-i', '--ignore-errors',
4357 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4358 general.add_option('-r', '--rate-limit',
4359 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4360 general.add_option('-R', '--retries',
4361 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4362 general.add_option('--dump-user-agent',
4363 action='store_true', dest='dump_user_agent',
4364 help='display the current browser identification', default=False)
4365 general.add_option('--list-extractors',
4366 action='store_true', dest='list_extractors',
4367 help='List all supported extractors and the URLs they would handle', default=False)
4369 selection.add_option('--playlist-start',
4370 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4371 selection.add_option('--playlist-end',
4372 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4373 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4374 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4375 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4377 authentication.add_option('-u', '--username',
4378 dest='username', metavar='USERNAME', help='account username')
4379 authentication.add_option('-p', '--password',
4380 dest='password', metavar='PASSWORD', help='account password')
4381 authentication.add_option('-n', '--netrc',
4382 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4385 video_format.add_option('-f', '--format',
4386 action='store', dest='format', metavar='FORMAT', help='video format code')
4387 video_format.add_option('--all-formats',
4388 action='store_const', dest='format', help='download all available video formats', const='all')
4389 video_format.add_option('--prefer-free-formats',
4390 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4391 video_format.add_option('--max-quality',
4392 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4393 video_format.add_option('-F', '--list-formats',
4394 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4395 video_format.add_option('--write-srt',
4396 action='store_true', dest='writesubtitles',
4397 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4398 video_format.add_option('--srt-lang',
4399 action='store', dest='subtitleslang', metavar='LANG',
4400 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4403 verbosity.add_option('-q', '--quiet',
4404 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4405 verbosity.add_option('-s', '--simulate',
4406 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4407 verbosity.add_option('--skip-download',
4408 action='store_true', dest='skip_download', help='do not download the video', default=False)
4409 verbosity.add_option('-g', '--get-url',
4410 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4411 verbosity.add_option('-e', '--get-title',
4412 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4413 verbosity.add_option('--get-thumbnail',
4414 action='store_true', dest='getthumbnail',
4415 help='simulate, quiet but print thumbnail URL', default=False)
4416 verbosity.add_option('--get-description',
4417 action='store_true', dest='getdescription',
4418 help='simulate, quiet but print video description', default=False)
4419 verbosity.add_option('--get-filename',
4420 action='store_true', dest='getfilename',
4421 help='simulate, quiet but print output filename', default=False)
4422 verbosity.add_option('--get-format',
4423 action='store_true', dest='getformat',
4424 help='simulate, quiet but print output format', default=False)
4425 verbosity.add_option('--no-progress',
4426 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4427 verbosity.add_option('--console-title',
4428 action='store_true', dest='consoletitle',
4429 help='display progress in console titlebar', default=False)
# '-v' re-registered here (was --version above); conflict_handler='resolve'
# lets this later --verbose definition take over the short flag.
4430 verbosity.add_option('-v', '--verbose',
4431 action='store_true', dest='verbose', help='print various debugging information', default=False)
4434 filesystem.add_option('-t', '--title',
4435 action='store_true', dest='usetitle', help='use title in file name', default=False)
4436 filesystem.add_option('-l', '--literal',
4437 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4438 filesystem.add_option('-A', '--auto-number',
4439 action='store_true', dest='autonumber',
4440 help='number downloaded files starting from 00000', default=False)
4441 filesystem.add_option('-o', '--output',
4442 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4443 filesystem.add_option('-a', '--batch-file',
4444 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4445 filesystem.add_option('-w', '--no-overwrites',
4446 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4447 filesystem.add_option('-c', '--continue',
4448 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4449 filesystem.add_option('--no-continue',
4450 action='store_false', dest='continue_dl',
4451 help='do not resume partially downloaded files (restart from beginning)')
4452 filesystem.add_option('--cookies',
4453 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4454 filesystem.add_option('--no-part',
4455 action='store_true', dest='nopart', help='do not use .part files', default=False)
4456 filesystem.add_option('--no-mtime',
4457 action='store_false', dest='updatetime',
4458 help='do not use the Last-modified header to set the file modification time', default=True)
4459 filesystem.add_option('--write-description',
4460 action='store_true', dest='writedescription',
4461 help='write video description to a .description file', default=False)
4462 filesystem.add_option('--write-info-json',
4463 action='store_true', dest='writeinfojson',
4464 help='write video metadata to a .info.json file', default=False)
4467 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4468 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4469 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4470 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4471 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4472 help='ffmpeg audio bitrate specification, 128k by default')
4473 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4474 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Group registration order controls help-listing order.
4477 parser.add_option_group(general)
4478 parser.add_option_group(selection)
4479 parser.add_option_group(filesystem)
4480 parser.add_option_group(verbosity)
4481 parser.add_option_group(video_format)
4482 parser.add_option_group(authentication)
4483 parser.add_option_group(postproc)
# Configuration sources, lowest to highest precedence:
# /etc/youtube-dl.conf, then the user config (XDG_CONFIG_HOME or
# ~/.config/youtube-dl.conf), then the actual command line.
4485 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4487 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4489 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4490 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4491 opts, args = parser.parse_args(argv)
4493 return parser, opts, args
# NOTE(review): mangled extract — the `return [` line and most of the
# extractor-instance list entries are elided from this view (embedded line
# numbers jump). Code kept byte-identical; comments only.
4495 def gen_extractors():
4496 """ Return a list of an instance of every supported extractor.
4497 The order does matter; the first extractor matched is the one handling the URL.
# Shared IE instances: playlist/user/search extractors delegate single-video
# handling to these.
4499 youtube_ie = YoutubeIE()
4500 google_ie = GoogleIE()
4501 yahoo_ie = YahooIE()
# Entries of the returned list (list head elided); more specific extractors
# come before the generic single-video ones they wrap.
4503 YoutubePlaylistIE(youtube_ie),
4504 YoutubeUserIE(youtube_ie),
4505 YoutubeSearchIE(youtube_ie),
4507 MetacafeIE(youtube_ie),
4510 GoogleSearchIE(google_ie),
4513 YahooSearchIE(yahoo_ie),
4526 StanfordOpenClassroomIE(),
# NOTE(review): interior of _real_main() — the `def` line, the try:/else:
# scaffolding, and many statement lines are elided from this extract
# (embedded line numbers are non-contiguous). Code kept byte-identical;
# comments only. Python 2 throughout (print statement, `except X, err`,
# `long()`, list-returning filter()).
4533 parser, opts, args = parseOpts()
4535 # Open appropriate CookieJar
4536 if opts.cookiefile is None:
4537 jar = cookielib.CookieJar()
# With --cookies, load an existing Mozilla-format cookie file if readable.
4540 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4541 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4543 except (IOError, OSError), err:
4544 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string and (elided) exit.
4547 if opts.dump_user_agent:
4548 print std_headers['User-Agent']
4551 # Batch file verification
4553 if opts.batchfile is not None:
# '-' means read the URL list from stdin (branch body elided).
4555 if opts.batchfile == '-':
4558 batchfd = open(opts.batchfile, 'r')
4559 batchurls = batchfd.readlines()
4560 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
4561 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4563 sys.exit(u'ERROR: batch file could not be read')
4564 all_urls = batchurls + args
4566 # General configuration
# Install a global urllib2 opener with cookies, proxy support and the
# project's gzip/deflate handler.
4567 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4568 proxy_handler = urllib2.ProxyHandler()
4569 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4570 urllib2.install_opener(opener)
4571 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Printed under --verbose (the guarding `if opts.verbose:` is elided).
4574 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4576 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it would
# handle, consuming matched URLs so each is attributed once.
4578 if opts.list_extractors:
4579 for ie in extractors:
4581 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4582 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4583 for mu in matchedUrls:
4587 # Conflicting, missing and erroneous options
4588 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4589 parser.error(u'using .netrc conflicts with giving username/password')
4590 if opts.password is not None and opts.username is None:
4591 parser.error(u'account username missing')
4592 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4593 parser.error(u'using output template conflicts with using title, literal title or auto number')
4594 if opts.usetitle and opts.useliteral:
4595 parser.error(u'using title conflicts with using literal title')
# Username given without password: prompt interactively (never on argv).
4596 if opts.username is not None and opts.password is None:
4597 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string-valued numeric options; parser.error() exits the process.
4598 if opts.ratelimit is not None:
4599 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4600 if numeric_limit is None:
4601 parser.error(u'invalid rate limit specified')
4602 opts.ratelimit = numeric_limit
4603 if opts.retries is not None:
4605 opts.retries = long(opts.retries)
4606 except (TypeError, ValueError), err:
4607 parser.error(u'invalid retry count specified')
4609 opts.playliststart = int(opts.playliststart)
4610 if opts.playliststart <= 0:
4611 raise ValueError(u'Playlist start must be positive')
4612 except (TypeError, ValueError), err:
4613 parser.error(u'invalid playlist start number specified')
4615 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "until the end of the playlist".
4616 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4617 raise ValueError(u'Playlist end must be greater than playlist start')
4618 except (TypeError, ValueError), err:
4619 parser.error(u'invalid playlist end number specified')
4620 if opts.extractaudio:
4621 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4622 parser.error(u'invalid audio format specified')
# Build the FileDownloader from the validated options. Any of the
# force*/get* flags implies quiet simulation (URL/title/... printed only).
4625 fd = FileDownloader({
4626 'usenetrc': opts.usenetrc,
4627 'username': opts.username,
4628 'password': opts.password,
4629 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4630 'forceurl': opts.geturl,
4631 'forcetitle': opts.gettitle,
4632 'forcethumbnail': opts.getthumbnail,
4633 'forcedescription': opts.getdescription,
4634 'forcefilename': opts.getfilename,
4635 'forceformat': opts.getformat,
4636 'simulate': opts.simulate,
4637 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4638 'format': opts.format,
4639 'format_limit': opts.format_limit,
4640 'listformats': opts.listformats,
# Output template: explicit -o wins; otherwise pick a default pattern from
# the -t/-l/-A flags; short-circuiting 'and'/'or' chain selects the first
# applicable template string.
4641 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4642 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4643 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4644 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4645 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4646 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4647 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4648 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4649 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4650 or u'%(id)s.%(ext)s'),
4651 'ignoreerrors': opts.ignoreerrors,
4652 'ratelimit': opts.ratelimit,
4653 'nooverwrites': opts.nooverwrites,
4654 'retries': opts.retries,
4655 'continuedl': opts.continue_dl,
4656 'noprogress': opts.noprogress,
4657 'playliststart': opts.playliststart,
4658 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') forces logging to stderr.
4659 'logtostderr': opts.outtmpl == '-',
4660 'consoletitle': opts.consoletitle,
4661 'nopart': opts.nopart,
4662 'updatetime': opts.updatetime,
4663 'writedescription': opts.writedescription,
4664 'writeinfojson': opts.writeinfojson,
4665 'writesubtitles': opts.writesubtitles,
4666 'subtitleslang': opts.subtitleslang,
4667 'matchtitle': opts.matchtitle,
4668 'rejecttitle': opts.rejecttitle,
4669 'max_downloads': opts.max_downloads,
4670 'prefer_free_formats': opts.prefer_free_formats,
4671 'verbose': opts.verbose,
# Register every extractor; order (from gen_extractors) decides which IE
# claims a URL first.
4673 for extractor in extractors:
4674 fd.add_info_extractor(extractor)
# PostProcessors (only audio extraction exists at this point).
4677 if opts.extractaudio:
4678 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# -U/--update: self-update the running script, then (elided) exit.
4681 if opts.update_self:
4682 updateSelf(fd, sys.argv[0])
# No URLs is only an error when we are not doing a self-update.
4685 if len(all_urls) < 1:
4686 if not opts.update_self:
4687 parser.error(u'you must provide at least one URL')
# Main download loop; MaxDownloadsReached implements --max-downloads.
4692 retcode = fd.download(all_urls)
4693 except MaxDownloadsReached:
4694 fd.to_screen(u'--max-download limit reached, aborting.')
4697 # Dump cookie jar if requested
4698 if opts.cookiefile is not None:
4701 except (IOError, OSError), err:
4702 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): interior of main() — the `def main():`, the `try:` wrapping
# `_real_main()` and the DownloadError handler body are elided from this
# extract. Code kept byte-identical; comments only.
# DownloadError was already reported by the downloader; handler body
# (elided) presumably just exits with a failure status — confirm upstream.
4709 except DownloadError:
4711 except SameFileError:
4712 sys.exit(u'ERROR: fixed output name but more than one file to download')
4713 except KeyboardInterrupt:
4714 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point; the guarded body (elided from this extract) invokes
# the main() wrapper defined above.
4716 if __name__ == '__main__':
# Editor modeline: real tabs ('noet'), width 4.
4719 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: