2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitize a video title so it can be used as part of a filename."""
    # Turn HTML entities into real characters first, then make sure the
    # path separator cannot appear in the resulting file name.
    decoded = re.sub(r'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
class MaxDownloadsReached(Exception):
    """Raised when the --max-downloads limit has been reached."""
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
373 def __init__(self, downloaded, expected):
374 self.downloaded = downloaded
375 self.expected = expected
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 writesubtitles: Write the video subtitles to a .srt file
494 subtitleslang: Language of the subtitles to download
500 _download_retcode = None
501 _num_downloads = None
504 def __init__(self, params):
505 """Create a FileDownloader object with the given options."""
508 self._download_retcode = 0
509 self._num_downloads = 0
510 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
514 def format_bytes(bytes):
517 if type(bytes) is str:
522 exponent = long(math.log(bytes, 1024.0))
523 suffix = 'bkMGTPEZY'[exponent]
524 converted = float(bytes) / float(1024 ** exponent)
525 return '%.2f%s' % (converted, suffix)
528 def calc_percent(byte_counter, data_len):
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
534 def calc_eta(start, now, total, current):
538 if current == 0 or dif < 0.001: # One millisecond
540 rate = float(current) / dif
541 eta = long((float(total) - float(current)) / rate)
542 (eta_mins, eta_secs) = divmod(eta, 60)
545 return '%02d:%02d' % (eta_mins, eta_secs)
548 def calc_speed(start, now, bytes):
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
555 def best_block_size(elapsed_time, bytes):
556 new_min = max(bytes / 2.0, 1.0)
557 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
558 if elapsed_time < 0.001:
560 rate = bytes / elapsed_time
568 def parse_bytes(bytestr):
569 """Parse a string indicating a byte quantity into a long integer."""
570 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
573 number = float(matchobj.group(1))
574 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
575 return long(round(number * multiplier))
577 def add_info_extractor(self, ie):
578 """Add an InfoExtractor object to the end of the list."""
580 ie.set_downloader(self)
582 def add_post_processor(self, pp):
583 """Add a PostProcessor object to the end of the chain."""
585 pp.set_downloader(self)
587 def to_screen(self, message, skip_eol=False):
588 """Print message to stdout if not in quiet mode."""
589 assert type(message) == type(u'')
590 if not self.params.get('quiet', False):
591 terminator = [u'\n', u''][skip_eol]
592 output = message + terminator
594 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
595 output = output.encode(preferredencoding(), 'ignore')
596 self._screen_file.write(output)
597 self._screen_file.flush()
599 def to_stderr(self, message):
600 """Print message to stderr."""
601 print >>sys.stderr, message.encode(preferredencoding())
603 def to_cons_title(self, message):
604 """Set console/terminal window title to message."""
605 if not self.params.get('consoletitle', False):
607 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
608 # c_wchar_p() might not be necessary if `message` is
609 # already of type unicode()
610 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
611 elif 'TERM' in os.environ:
612 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614 def fixed_template(self):
615 """Checks if the output template is fixed."""
616 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618 def trouble(self, message=None):
619 """Determine action to take when a download problem appears.
621 Depending on if the downloader has been configured to ignore
622 download errors or not, this method may throw an exception or
623 not when errors are found, after printing the message.
625 if message is not None:
626 self.to_stderr(message)
627 if not self.params.get('ignoreerrors', False):
628 raise DownloadError(message)
629 self._download_retcode = 1
631 def slow_down(self, start_time, byte_counter):
632 """Sleep if the download speed is over the rate limit."""
633 rate_limit = self.params.get('ratelimit', None)
634 if rate_limit is None or byte_counter == 0:
637 elapsed = now - start_time
640 speed = float(byte_counter) / elapsed
641 if speed > rate_limit:
642 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649 return filename + u'.part'
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
656 def try_rename(self, old_filename, new_filename):
658 if old_filename == new_filename:
660 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
661 except (IOError, OSError), err:
662 self.trouble(u'ERROR: unable to rename file')
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
668 if not os.path.isfile(_encodeFilename(filename)):
670 timestr = last_modified_hdr
673 filetime = timeconvert(timestr)
677 os.utime(filename, (time.time(), filetime))
682 def report_writedescription(self, descfn):
683 """ Report that the description file is being written """
684 self.to_screen(u'[info] Writing video description to: ' + descfn)
686 def report_writesubtitles(self, srtfn):
687 """ Report that the subtitles file is being written """
688 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690 def report_writeinfojson(self, infofn):
691 """ Report that the metadata file has been written """
692 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694 def report_destination(self, filename):
695 """Report destination filename."""
696 self.to_screen(u'[download] Destination: ' + filename)
698 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
699 """Report download progress."""
700 if self.params.get('noprogress', False):
702 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
703 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
704 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
705 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707 def report_resuming_byte(self, resume_len):
708 """Report attempt to resume at given byte."""
709 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711 def report_retry(self, count, retries):
712 """Report retry in case of HTTP error 5xx"""
713 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715 def report_file_already_downloaded(self, file_name):
716 """Report file has already been fully downloaded."""
718 self.to_screen(u'[download] %s has already been downloaded' % file_name)
719 except (UnicodeEncodeError), err:
720 self.to_screen(u'[download] The file has already been downloaded')
722 def report_unable_to_resume(self):
723 """Report it was impossible to resume download."""
724 self.to_screen(u'[download] Unable to resume')
726 def report_finish(self):
727 """Report download finished."""
728 if self.params.get('noprogress', False):
729 self.to_screen(u'[download] Download completed')
733 def increment_downloads(self):
734 """Increment the ordinal that assigns a number to each file."""
735 self._num_downloads += 1
737 def prepare_filename(self, info_dict):
738 """Generate the output filename."""
740 template_dict = dict(info_dict)
741 template_dict['epoch'] = unicode(long(time.time()))
742 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
743 filename = self.params['outtmpl'] % template_dict
745 except (ValueError, KeyError), err:
746 self.trouble(u'ERROR: invalid system charset or erroneous output template')
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
774 filename = self.prepare_filename(info_dict)
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
805 if self.params.get('writedescription', False):
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
811 descfile.write(info_dict['description'].encode('utf-8'))
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
842 infof = open(_encodeFilename(infofn), 'wb')
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
874 def download(self, url_list):
875 """Download a given list of URLs."""
876 if len(url_list) > 1 and self.fixed_template():
877 raise SameFileError(self.params['outtmpl'])
880 suitable_found = False
882 # Go to next InfoExtractor if not suitable
883 if not ie.suitable(url):
886 # Suitable InfoExtractor found
887 suitable_found = True
889 # Extract information from URL and process it
892 # Suitable InfoExtractor had been found; go to next URL
895 if not suitable_found:
896 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898 return self._download_retcode
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
903 info['filepath'] = filename
909 def _download_with_rtmpdump(self, filename, url, player_url):
910 self.report_destination(filename)
911 tmpfilename = self.temp_name(filename)
913 # Check for rtmpdump first
915 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
916 except (OSError, IOError):
917 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
920 # Download using rtmpdump. rtmpdump returns exit code 2 when
921 # the connection was interrumpted and resuming appears to be
922 # possible. This is part of rtmpdump's normal usage, AFAIK.
923 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
924 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
925 if self.params.get('verbose', False):
928 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
931 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
932 retval = subprocess.call(args)
933 while retval == 2 or retval == 1:
934 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
935 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
936 time.sleep(5.0) # This seems to be needed
937 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
938 cursize = os.path.getsize(_encodeFilename(tmpfilename))
939 if prevsize == cursize and retval == 1:
941 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
942 if prevsize == cursize and retval == 2 and cursize > 1024:
943 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
947 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
948 self.try_rename(tmpfilename, filename)
951 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
967 tmpfilename = self.temp_name(filename)
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1033 if count <= retries:
1034 self.report_retry(count, retries)
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1052 if len(data_block) == 0:
1054 byte_counter += len(data_block)
1056 # Open file just in time
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071 block_size = self.best_block_size(after - before, len(data_block))
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1083 self.slow_down(start, byte_counter - resume_len)
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # Lazily-set state: initialize() runs _real_initialize() only once.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the real initialization at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1171 class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
1174 _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)'
1175 _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$'
1177 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1178 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1179 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1180 _NETRC_MACHINE = 'youtube'
1181 # Listed in order of quality
1182 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1183 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1184 _video_extensions = {
1190 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1195 _video_dimensions = {
1210 IE_NAME = u'youtube'
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1232 def report_video_subtitles_download(self, video_id):
1233 """Report attempt to download video info webpage."""
1234 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1240 def report_unavailable_format(self, video_id, format):
1241 """Report extracted video URL."""
1242 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1248 def _closed_captions_xml_to_srt(self, xml_string):
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
1265 def _print_formats(self, formats):
1266 print 'Available formats:'
1268 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1270 def _real_initialize(self):
1271 if self._downloader is None:
1276 downloader_params = self._downloader.params
1278 # Attempt to use provided username and password or .netrc data
1279 if downloader_params.get('username', None) is not None:
1280 username = downloader_params['username']
1281 password = downloader_params['password']
1282 elif downloader_params.get('usenetrc', False):
1284 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1285 if info is not None:
1289 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1290 except (IOError, netrc.NetrcParseError), err:
1291 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1295 request = urllib2.Request(self._LANG_URL)
1298 urllib2.urlopen(request).read()
1299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1303 # No authentication to be performed
1304 if username is None:
1309 'current_form': 'loginForm',
1311 'action_login': 'Log In',
1312 'username': username,
1313 'password': password,
1315 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1318 login_results = urllib2.urlopen(request).read()
1319 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1329 'action_confirm': 'Confirm',
1331 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1333 self.report_age_confirmation()
1334 age_results = urllib2.urlopen(request).read()
1335 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1336 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1339 def _real_extract(self, url):
1340 # Extract original video URL from URL with age verification, using next_url parameter
1341 mobj = re.match(self._VALID_URL_WITH_AGE, url)
1343 urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x)
1344 # Keep original domain. We can probably change to www.youtube.com, but it should not hurt so keep it.
1345 # We just make sure we do not have double //, in URL, so we strip starting slash in next_url.
1346 url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2)))
1348 # Extract video id from URL
1349 mobj = re.match(self._VALID_URL, url)
1351 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1353 video_id = mobj.group(2)
1356 self.report_video_webpage_download(video_id)
1357 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1359 video_webpage = urllib2.urlopen(request).read()
1360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1364 # Attempt to extract SWF player URL
1365 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1366 if mobj is not None:
1367 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1372 self.report_video_info_webpage_download(video_id)
1373 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1374 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1375 % (video_id, el_type))
1376 request = urllib2.Request(video_info_url)
1378 video_info_webpage = urllib2.urlopen(request).read()
1379 video_info = parse_qs(video_info_webpage)
1380 if 'token' in video_info:
1382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1385 if 'token' not in video_info:
1386 if 'reason' in video_info:
1387 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1389 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1392 # Start extracting information
1393 self.report_information_extraction(video_id)
1396 if 'author' not in video_info:
1397 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1399 video_uploader = urllib.unquote_plus(video_info['author'][0])
1402 if 'title' not in video_info:
1403 self._downloader.trouble(u'ERROR: unable to extract video title')
1405 video_title = urllib.unquote_plus(video_info['title'][0])
1406 video_title = video_title.decode('utf-8')
1407 video_title = sanitize_title(video_title)
1410 simple_title = _simplify_title(video_title)
1413 if 'thumbnail_url' not in video_info:
1414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1415 video_thumbnail = ''
1416 else: # don't panic if we can't find it
1417 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1422 if mobj is not None:
1423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1425 for expression in format_expressions:
1427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1435 video_description = u'No description available.'
1436 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1437 if mobj is not None:
1438 video_description = mobj.group(1).decode('utf-8')
1440 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1441 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1442 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1443 # TODO use another parser
1446 video_subtitles = None
1447 if self._downloader.params.get('writesubtitles', False):
1448 self.report_video_subtitles_download(video_id)
1449 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1451 srt_list = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1455 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1457 if self._downloader.params.get('subtitleslang', False):
1458 srt_lang = self._downloader.params.get('subtitleslang')
1459 elif 'en' in srt_lang_list:
1462 srt_lang = srt_lang_list[0]
1463 if not srt_lang in srt_lang_list:
1464 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1466 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1468 srt_xml = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1472 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1474 self._downloader.trouble(u'WARNING: video has no closed captions')
1477 video_token = urllib.unquote_plus(video_info['token'][0])
1479 # Decide which formats to download
1480 req_format = self._downloader.params.get('format', None)
1482 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1483 self.report_rtmp_download()
1484 video_url_list = [(None, video_info['conn'][0])]
1485 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1486 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1487 url_data = [parse_qs(uds) for uds in url_data_strs]
1488 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1489 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1491 format_limit = self._downloader.params.get('format_limit', None)
1492 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1493 if format_limit is not None and format_limit in available_formats:
1494 format_list = available_formats[available_formats.index(format_limit):]
1496 format_list = available_formats
1497 existing_formats = [x for x in format_list if x in url_map]
1498 if len(existing_formats) == 0:
1499 self._downloader.trouble(u'ERROR: no known formats available for video')
1501 if self._downloader.params.get('listformats', None):
1502 self._print_formats(existing_formats)
1504 if req_format is None or req_format == 'best':
1505 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1506 elif req_format == 'worst':
1507 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1508 elif req_format in ('-1', 'all'):
1509 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1511 # Specific formats. We pick the first in a slash-delimeted sequence.
1512 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1513 req_formats = req_format.split('/')
1514 video_url_list = None
1515 for rf in req_formats:
1517 video_url_list = [(rf, url_map[rf])]
1519 if video_url_list is None:
1520 self._downloader.trouble(u'ERROR: requested format not available')
1523 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1526 for format_param, video_real_url in video_url_list:
1527 # At this point we have a new video
1528 self._downloader.increment_downloads()
1531 video_extension = self._video_extensions.get(format_param, 'flv')
1534 # Process video information
1535 self._downloader.process_info({
1536 'id': video_id.decode('utf-8'),
1537 'url': video_real_url.decode('utf-8'),
1538 'uploader': video_uploader.decode('utf-8'),
1539 'upload_date': upload_date,
1540 'title': video_title,
1541 'stitle': simple_title,
1542 'ext': video_extension.decode('utf-8'),
1543 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1544 'thumbnail': video_thumbnail.decode('utf-8'),
1545 'description': video_description,
1546 'player_url': player_url,
1547 'subtitles': video_subtitles
1549 except UnavailableVideoError, err:
1550 self._downloader.trouble(u'\nERROR: unable to download video')
1553 class MetacafeIE(InfoExtractor):
1554 """Information Extractor for metacafe.com."""
1556 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1557 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1558 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1560 IE_NAME = u'metacafe'
1562 def __init__(self, youtube_ie, downloader=None):
1563 InfoExtractor.__init__(self, downloader)
1564 self._youtube_ie = youtube_ie
1566 def report_disclaimer(self):
1567 """Report disclaimer retrieval."""
1568 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1570 def report_age_confirmation(self):
1571 """Report attempt to confirm age."""
1572 self._downloader.to_screen(u'[metacafe] Confirming age')
1574 def report_download_webpage(self, video_id):
1575 """Report webpage download."""
1576 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1578 def report_extraction(self, video_id):
1579 """Report information extraction."""
1580 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1582 def _real_initialize(self):
1583 # Retrieve disclaimer
1584 request = urllib2.Request(self._DISCLAIMER)
1586 self.report_disclaimer()
1587 disclaimer = urllib2.urlopen(request).read()
1588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1595 'submit': "Continue - I'm over 18",
1597 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1599 self.report_age_confirmation()
1600 disclaimer = urllib2.urlopen(request).read()
1601 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1602 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1605 def _real_extract(self, url):
1606 # Extract id and simplified title from URL
1607 mobj = re.match(self._VALID_URL, url)
1609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1612 video_id = mobj.group(1)
1614 # Check if video comes from YouTube
1615 mobj2 = re.match(r'^yt-(.*)$', video_id)
1616 if mobj2 is not None:
1617 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1620 # At this point we have a new video
1621 self._downloader.increment_downloads()
1623 simple_title = mobj.group(2).decode('utf-8')
1625 # Retrieve video webpage to extract further information
1626 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1628 self.report_download_webpage(video_id)
1629 webpage = urllib2.urlopen(request).read()
1630 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1631 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1634 # Extract URL, uploader and title from webpage
1635 self.report_extraction(video_id)
1636 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1637 if mobj is not None:
1638 mediaURL = urllib.unquote(mobj.group(1))
1639 video_extension = mediaURL[-3:]
1641 # Extract gdaKey if available
1642 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1644 video_url = mediaURL
1646 gdaKey = mobj.group(1)
1647 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1649 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1651 self._downloader.trouble(u'ERROR: unable to extract media URL')
1653 vardict = parse_qs(mobj.group(1))
1654 if 'mediaData' not in vardict:
1655 self._downloader.trouble(u'ERROR: unable to extract media URL')
1657 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1659 self._downloader.trouble(u'ERROR: unable to extract media URL')
1661 mediaURL = mobj.group(1).replace('\\/', '/')
1662 video_extension = mediaURL[-3:]
1663 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1665 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1667 self._downloader.trouble(u'ERROR: unable to extract title')
1669 video_title = mobj.group(1).decode('utf-8')
1670 video_title = sanitize_title(video_title)
1672 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1674 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1676 video_uploader = mobj.group(1)
1679 # Process video information
1680 self._downloader.process_info({
1681 'id': video_id.decode('utf-8'),
1682 'url': video_url.decode('utf-8'),
1683 'uploader': video_uploader.decode('utf-8'),
1684 'upload_date': u'NA',
1685 'title': video_title,
1686 'stitle': simple_title,
1687 'ext': video_extension.decode('utf-8'),
1691 except UnavailableVideoError:
1692 self._downloader.trouble(u'\nERROR: unable to download video')
1695 class DailymotionIE(InfoExtractor):
1696 """Information Extractor for Dailymotion"""
1698 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1699 IE_NAME = u'dailymotion'
1701 def __init__(self, downloader=None):
1702 InfoExtractor.__init__(self, downloader)
1704 def report_download_webpage(self, video_id):
1705 """Report webpage download."""
1706 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1708 def report_extraction(self, video_id):
1709 """Report information extraction."""
1710 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1712 def _real_extract(self, url):
1713 # Extract id and simplified title from URL
1714 mobj = re.match(self._VALID_URL, url)
1716 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1719 # At this point we have a new video
1720 self._downloader.increment_downloads()
1721 video_id = mobj.group(1)
1723 video_extension = 'flv'
1725 # Retrieve video webpage to extract further information
1726 request = urllib2.Request(url)
1727 request.add_header('Cookie', 'family_filter=off')
1729 self.report_download_webpage(video_id)
1730 webpage = urllib2.urlopen(request).read()
1731 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1732 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1735 # Extract URL, uploader and title from webpage
1736 self.report_extraction(video_id)
1737 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1739 self._downloader.trouble(u'ERROR: unable to extract media URL')
1741 sequence = urllib.unquote(mobj.group(1))
1742 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1744 self._downloader.trouble(u'ERROR: unable to extract media URL')
1746 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1748 # if needed add http://www.dailymotion.com/ if relative URL
1750 video_url = mediaURL
1752 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1754 self._downloader.trouble(u'ERROR: unable to extract title')
1756 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1757 video_title = sanitize_title(video_title)
1758 simple_title = _simplify_title(video_title)
1760 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1762 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1764 video_uploader = mobj.group(1)
1767 # Process video information
1768 self._downloader.process_info({
1769 'id': video_id.decode('utf-8'),
1770 'url': video_url.decode('utf-8'),
1771 'uploader': video_uploader.decode('utf-8'),
1772 'upload_date': u'NA',
1773 'title': video_title,
1774 'stitle': simple_title,
1775 'ext': video_extension.decode('utf-8'),
1779 except UnavailableVideoError:
1780 self._downloader.trouble(u'\nERROR: unable to download video')
1783 class GoogleIE(InfoExtractor):
1784 """Information extractor for video.google.com."""
1786 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1787 IE_NAME = u'video.google'
1789 def __init__(self, downloader=None):
1790 InfoExtractor.__init__(self, downloader)
1792 def report_download_webpage(self, video_id):
1793 """Report webpage download."""
1794 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1796 def report_extraction(self, video_id):
1797 """Report information extraction."""
1798 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1800 def _real_extract(self, url):
1801 # Extract id from URL
1802 mobj = re.match(self._VALID_URL, url)
1804 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1807 # At this point we have a new video
1808 self._downloader.increment_downloads()
1809 video_id = mobj.group(1)
1811 video_extension = 'mp4'
1813 # Retrieve video webpage to extract further information
1814 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1816 self.report_download_webpage(video_id)
1817 webpage = urllib2.urlopen(request).read()
1818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1819 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1822 # Extract URL, uploader, and title from webpage
1823 self.report_extraction(video_id)
1824 mobj = re.search(r"download_url:'([^']+)'", webpage)
1826 video_extension = 'flv'
1827 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1829 self._downloader.trouble(u'ERROR: unable to extract media URL')
1831 mediaURL = urllib.unquote(mobj.group(1))
1832 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1833 mediaURL = mediaURL.replace('\\x26', '\x26')
1835 video_url = mediaURL
1837 mobj = re.search(r'<title>(.*)</title>', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract title')
1841 video_title = mobj.group(1).decode('utf-8')
1842 video_title = sanitize_title(video_title)
1843 simple_title = _simplify_title(video_title)
1845 # Extract video description
1846 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1848 self._downloader.trouble(u'ERROR: unable to extract video description')
1850 video_description = mobj.group(1).decode('utf-8')
1851 if not video_description:
1852 video_description = 'No description available.'
1854 # Extract video thumbnail
1855 if self._downloader.params.get('forcethumbnail', False):
1856 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1858 webpage = urllib2.urlopen(request).read()
1859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1860 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1864 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1866 video_thumbnail = mobj.group(1)
1867 else: # we need something to pass to process_info
1868 video_thumbnail = ''
1871 # Process video information
1872 self._downloader.process_info({
1873 'id': video_id.decode('utf-8'),
1874 'url': video_url.decode('utf-8'),
1876 'upload_date': u'NA',
1877 'title': video_title,
1878 'stitle': simple_title,
1879 'ext': video_extension.decode('utf-8'),
1883 except UnavailableVideoError:
1884 self._downloader.trouble(u'\nERROR: unable to download video')
1887 class PhotobucketIE(InfoExtractor):
1888 """Information extractor for photobucket.com."""
1890 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1891 IE_NAME = u'photobucket'
1893 def __init__(self, downloader=None):
1894 InfoExtractor.__init__(self, downloader)
1896 def report_download_webpage(self, video_id):
1897 """Report webpage download."""
1898 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1900 def report_extraction(self, video_id):
1901 """Report information extraction."""
1902 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1904 def _real_extract(self, url):
1905 # Extract id from URL
1906 mobj = re.match(self._VALID_URL, url)
1908 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1911 # At this point we have a new video
1912 self._downloader.increment_downloads()
1913 video_id = mobj.group(1)
1915 video_extension = 'flv'
1917 # Retrieve video webpage to extract further information
1918 request = urllib2.Request(url)
1920 self.report_download_webpage(video_id)
1921 webpage = urllib2.urlopen(request).read()
1922 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1923 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1926 # Extract URL, uploader, and title from webpage
1927 self.report_extraction(video_id)
1928 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1930 self._downloader.trouble(u'ERROR: unable to extract media URL')
1932 mediaURL = urllib.unquote(mobj.group(1))
1934 video_url = mediaURL
1936 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1938 self._downloader.trouble(u'ERROR: unable to extract title')
1940 video_title = mobj.group(1).decode('utf-8')
1941 video_title = sanitize_title(video_title)
1942 simple_title = _simplify_title(vide_title)
1944 video_uploader = mobj.group(2).decode('utf-8')
1947 # Process video information
1948 self._downloader.process_info({
1949 'id': video_id.decode('utf-8'),
1950 'url': video_url.decode('utf-8'),
1951 'uploader': video_uploader,
1952 'upload_date': u'NA',
1953 'title': video_title,
1954 'stitle': simple_title,
1955 'ext': video_extension.decode('utf-8'),
1959 except UnavailableVideoError:
1960 self._downloader.trouble(u'\nERROR: unable to download video')
1963 class YahooIE(InfoExtractor):
1964 """Information extractor for video.yahoo.com."""
1966 # _VALID_URL matches all Yahoo! Video URLs
1967 # _VPAGE_URL matches only the extractable '/watch/' URLs
1968 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1969 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1970 IE_NAME = u'video.yahoo'
1972 def __init__(self, downloader=None):
1973 InfoExtractor.__init__(self, downloader)
1975 def report_download_webpage(self, video_id):
1976 """Report webpage download."""
1977 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1979 def report_extraction(self, video_id):
1980 """Report information extraction."""
1981 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1983 def _real_extract(self, url, new_video=True):
1984 # Extract ID from URL
1985 mobj = re.match(self._VALID_URL, url)
1987 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1990 # At this point we have a new video
1991 self._downloader.increment_downloads()
1992 video_id = mobj.group(2)
1993 video_extension = 'flv'
# --- NOTE(review): tail of YahooIE._real_extract (Yahoo Video). The `def`
# line is above this excerpt, and the embedded line numbers show gaps
# (1999, 2003-2004, ...), so `try:` / `return` lines between the visible
# statements are elided from this listing. Code below is left byte-identical.
1995 # Rewrite valid but non-extractable URLs as
1996 # extractable English language /watch/ URLs
1997 if re.match(self._VPAGE_URL, url) is None:
1998 request = urllib2.Request(url)
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Scrape the numeric "id"/"vid" fields out of the page's JS, then re-enter
# this extractor once with the canonical /watch/<vid>/<id> URL
# (new_video=False prevents infinite recursion).
2005 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2007 self._downloader.trouble(u'ERROR: Unable to extract id field')
2009 yahoo_id = mobj.group(1)
2011 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2013 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2015 yahoo_vid = mobj.group(1)
2017 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2018 return self._real_extract(url, new_video=False)
2020 # Retrieve video webpage to extract further information
2021 request = urllib2.Request(url)
2023 self.report_download_webpage(video_id)
2024 webpage = urllib2.urlopen(request).read()
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2029 # Extract uploader and title from webpage
2030 self.report_extraction(video_id)
2031 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2033 self._downloader.trouble(u'ERROR: unable to extract video title')
2035 video_title = mobj.group(1).decode('utf-8')
2036 simple_title = _simplify_title(video_title)
# NOTE(review): this regex has two groups - group(1) is the literal
# 'people'/'profile' path segment, group(2) is the display name - yet
# group(1) is used below as the uploader. Verify group(2) was intended.
2038 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2040 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2042 video_uploader = mobj.group(1).decode('utf-8')
2044 # Extract video thumbnail
2045 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2047 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2049 video_thumbnail = mobj.group(1).decode('utf-8')
2051 # Extract video description
2052 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract video description')
2056 video_description = mobj.group(1).decode('utf-8')
2057 if not video_description:
2058 video_description = 'No description available.'
# Height/width are needed as query parameters for the playlist request below.
2060 # Extract video height and width
2061 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2063 self._downloader.trouble(u'ERROR: unable to extract video height')
2065 yv_video_height = mobj.group(1)
2067 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2069 self._downloader.trouble(u'ERROR: unable to extract video width')
2071 yv_video_width = mobj.group(1)
2073 # Retrieve video playlist to extract media URL
2074 # I'm not completely sure what all these options are, but we
2075 # seem to need most of them, otherwise the server sends a 401.
2076 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2077 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2078 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2079 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2080 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2082 self.report_download_webpage(video_id)
2083 webpage = urllib2.urlopen(request).read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2088 # Extract media URL from playlist XML
# The final media URL is APP host + FULLPATH; percent-unquoted, then HTML
# entities resolved via htmlentity_transform.
2089 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2091 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2093 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2094 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2097 # Process video information
2098 self._downloader.process_info({
2099 'id': video_id.decode('utf-8'),
2101 'uploader': video_uploader,
2102 'upload_date': u'NA',
2103 'title': video_title,
2104 'stitle': simple_title,
2105 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this dict literal (lines 2106
# and 2108); the later, un-decoded value wins. The duplicate should go.
2106 'thumbnail': video_thumbnail.decode('utf-8'),
2107 'description': video_description,
2108 'thumbnail': video_thumbnail,
2111 except UnavailableVideoError:
2112 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for vimeo.com: reads the embedded player-config JSON from the
# video page, picks the best available codec, and builds a play_redirect URL.
# NOTE(review): embedded line numbers show interior lines elided
# (try:/return/else: lines) - code kept byte-identical below.
2115 class VimeoIE(InfoExtractor):
2116 """Information extractor for vimeo.com."""
2118 # _VALID_URL matches Vimeo URLs
# NOTE(review): the '.' in '(?:(?:www|player).)' is unescaped, so it matches
# any character before 'vimeo.com' - probably intended as '\.'.
2119 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2122 def __init__(self, downloader=None):
2123 InfoExtractor.__init__(self, downloader)
2125 def report_download_webpage(self, video_id):
2126 """Report webpage download."""
2127 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2129 def report_extraction(self, video_id):
2130 """Report information extraction."""
2131 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2133 def _real_extract(self, url, new_video=True):
2134 # Extract ID from URL
2135 mobj = re.match(self._VALID_URL, url)
2137 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140 # At this point we have a new video
2141 self._downloader.increment_downloads()
2142 video_id = mobj.group(1)
2144 # Retrieve video webpage to extract further information
2145 request = urllib2.Request(url, None, std_headers)
2147 self.report_download_webpage(video_id)
2148 webpage = urllib2.urlopen(request).read()
2149 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2150 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2153 # Now we begin extracting as much information as we can from what we
2154 # retrieved. First we extract the information common to all extractors,
2155 # and latter we extract those that are Vimeo specific.
2156 self.report_extraction(video_id)
2158 # Extract the config JSON
# NOTE(review): slicing the page between ' = {config:' and ',assets:' is
# brittle - any markup change on Vimeo's side breaks this.
2159 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2161 config = json.loads(config)
2163 self._downloader.trouble(u'ERROR: unable to extract info section')
2167 video_title = config["video"]["title"]
2168 simple_title = _simplify_title(video_title)
2171 video_uploader = config["video"]["owner"]["name"]
2173 # Extract video thumbnail
2174 video_thumbnail = config["video"]["thumbnail"]
2176 # Extract video description
# Fallback chain: default text, then the <meta description> tag, then (per
# the lxml lines below - presumably inside a guarded branch) the
# id="description" element text.
2180 video_description = u'No description available.'
2181 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2182 if mobj is not None:
2183 video_description = mobj.group(1)
2185 html_parser = lxml.etree.HTMLParser()
2186 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2187 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2188 # TODO use another parser
2190 # Extract upload date
2191 video_upload_date = u'NA'
2192 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2193 if mobj is not None:
2194 video_upload_date = mobj.group(1)
2196 # Vimeo specific: extract request signature and timestamp
# sig/timestamp authenticate the play_redirect request built below.
2197 sig = config['request']['signature']
2198 timestamp = config['request']['timestamp']
2200 # Vimeo specific: extract video codec and quality information
2201 # TODO bind to format param
# First codec present in config["video"]["files"] wins; preference order is
# h264 > vp8 > vp6. 'hd' in the file list selects HD quality.
2202 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2203 for codec in codecs:
2204 if codec[0] in config["video"]["files"]:
2205 video_codec = codec[0]
2206 video_extension = codec[1]
2207 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2208 else: quality = 'sd'
2211 self._downloader.trouble(u'ERROR: no known codec found')
2214 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2215 %(video_id, sig, timestamp, quality, video_codec.upper())
2218 # Process video information
2219 self._downloader.process_info({
2222 'uploader': video_uploader,
2223 'upload_date': video_upload_date,
2224 'title': video_title,
2225 'stitle': simple_title,
2226 'ext': video_extension,
2227 'thumbnail': video_thumbnail,
2228 'description': video_description,
2231 except UnavailableVideoError:
2232 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: fetches any URL and tries to locate a direct media
# link via a couple of heuristic regexes (JW Player flashvars, then a broad
# file=/source= scan). Interior lines are elided in this listing.
2235 class GenericIE(InfoExtractor):
2236 """Generic last-resort information extractor."""
2239 IE_NAME = u'generic'
2241 def __init__(self, downloader=None):
2242 InfoExtractor.__init__(self, downloader)
2244 def report_download_webpage(self, video_id):
2245 """Report webpage download."""
# Deliberately warns: reaching this extractor means no specific IE matched.
2246 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2247 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2249 def report_extraction(self, video_id):
2250 """Report information extraction."""
2251 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2253 def _real_extract(self, url):
2254 # At this point we have a new video
2255 self._downloader.increment_downloads()
# Provisional id: last path segment; replaced below once the real media URL
# is known.
2257 video_id = url.split('/')[-1]
2258 request = urllib2.Request(url)
2260 self.report_download_webpage(video_id)
2261 webpage = urllib2.urlopen(request).read()
2262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2263 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2265 except ValueError, err:
2266 # since this is the last-resort InfoExtractor, if
2267 # this error is thrown, it'll be thrown here
2268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2271 self.report_extraction(video_id)
2272 # Start with something easy: JW Player in SWFObject
2273 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2275 # Broaden the search a little bit
2276 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2278 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2281 # It's possible that one of the regexes
2282 # matched, but returned an empty group:
2283 if mobj.group(1) is None:
2284 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2287 video_url = urllib.unquote(mobj.group(1))
2288 video_id = os.path.basename(video_url)
2290 # here's a fun little line of code for you:
# Splits the basename into (id, extension-without-dot).
2291 video_extension = os.path.splitext(video_id)[1][1:]
2292 video_id = os.path.splitext(video_id)[0]
2294 # it's tempting to parse this further, but you would
2295 # have to take into account all the variations like
2296 # Video Title - Site Name
2297 # Site Name | Video Title
2298 # Video Title - Tagline | Site Name
2299 # and so on and so forth; it's just not practical
2300 mobj = re.search(r'<title>(.*)</title>', webpage)
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2304 video_title = mobj.group(1).decode('utf-8')
2305 video_title = sanitize_title(video_title)
2306 simple_title = _simplify_title(video_title)
2308 # video uploader is domain name
2309 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path extracts the UPLOADER (domain), but the
# error message says 'unable to extract title' - copy/paste defect.
2311 self._downloader.trouble(u'ERROR: unable to extract title')
2313 video_uploader = mobj.group(1).decode('utf-8')
2316 # Process video information
2317 self._downloader.process_info({
2318 'id': video_id.decode('utf-8'),
2319 'url': video_url.decode('utf-8'),
2320 'uploader': video_uploader,
2321 'upload_date': u'NA',
2322 'title': video_title,
2323 'stitle': simple_title,
2324 'ext': video_extension.decode('utf-8'),
2328 except UnavailableVideoError, err:
2329 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles 'ytsearch[N|all]:<query>' pseudo-URLs: queries the YouTube GData
# API (50 results/page, capped at 1000) and delegates each hit to the
# wrapped YoutubeIE. Interior lines are elided in this listing.
2332 class YoutubeSearchIE(InfoExtractor):
2333 """Information Extractor for YouTube search queries."""
2334 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2335 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2337 _max_youtube_results = 1000
2338 IE_NAME = u'youtube:search'
2340 def __init__(self, youtube_ie, downloader=None):
2341 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for each individual result.
2342 self._youtube_ie = youtube_ie
2344 def report_download_page(self, query, pagenum):
2345 """Report attempt to download playlist page with given number."""
2346 query = query.decode(preferredencoding())
2347 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2349 def _real_initialize(self):
2350 self._youtube_ie.initialize()
2352 def _real_extract(self, query):
2353 mobj = re.match(self._VALID_URL, query)
2355 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '', 'all', or a number of results to fetch.
2358 prefix, query = query.split(':')
2360 query = query.encode('utf-8')
2362 self._download_n_results(query, 1)
2364 elif prefix == 'all':
2365 self._download_n_results(query, self._max_youtube_results)
2371 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the API cap are clamped, with a warning.
2373 elif n > self._max_youtube_results:
2374 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2375 n = self._max_youtube_results
2376 self._download_n_results(query, n)
2378 except ValueError: # parsing prefix as integer fails
2379 self._download_n_results(query, 1)
2382 def _download_n_results(self, query, n):
2383 """Downloads a specified number of results for a query"""
# Pages through the API 50 ids at a time until the requested count (or the
# feed's totalItems) is reached, then extracts each video.
2389 while (50 * pagenum) < limit:
2390 self.report_download_page(query, pagenum+1)
2391 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2392 request = urllib2.Request(result_url)
2394 data = urllib2.urlopen(request).read()
2395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2396 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2398 api_response = json.loads(data)['data']
2400 new_ids = list(video['id'] for video in api_response['items'])
2401 video_ids += new_ids
2403 limit = min(n, api_response['totalItems'])
2406 if len(video_ids) > n:
2407 video_ids = video_ids[:n]
2408 for id in video_ids:
2409 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Handles 'gvsearch[N|all]:<query>' pseudo-URLs: scrapes Google Video search
# result pages and delegates each hit to the wrapped GoogleIE. Structure
# mirrors YoutubeSearchIE. Interior lines are elided in this listing.
2413 class GoogleSearchIE(InfoExtractor):
2414 """Information Extractor for Google Video search queries."""
2415 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2416 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2417 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" link; its absence ends pagination.
2418 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2420 _max_google_results = 1000
2421 IE_NAME = u'video.google:search'
2423 def __init__(self, google_ie, downloader=None):
2424 InfoExtractor.__init__(self, downloader)
2425 self._google_ie = google_ie
2427 def report_download_page(self, query, pagenum):
2428 """Report attempt to download playlist page with given number."""
2429 query = query.decode(preferredencoding())
2430 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2432 def _real_initialize(self):
2433 self._google_ie.initialize()
2435 def _real_extract(self, query):
2436 mobj = re.match(self._VALID_URL, query)
2438 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2441 prefix, query = query.split(':')
2443 query = query.encode('utf-8')
2445 self._download_n_results(query, 1)
2447 elif prefix == 'all':
2448 self._download_n_results(query, self._max_google_results)
2454 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2456 elif n > self._max_google_results:
2457 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2458 n = self._max_google_results
2459 self._download_n_results(query, n)
2461 except ValueError: # parsing prefix as integer fails
2462 self._download_n_results(query, 1)
2465 def _download_n_results(self, query, n):
2466 """Downloads a specified number of results for a query"""
# Scrapes result pages (10 per page via start=pagenum*10), collecting
# unique docids until n are found or no "next" link remains.
2472 self.report_download_page(query, pagenum)
2473 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2474 request = urllib2.Request(result_url)
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2481 # Extract video identifiers
2482 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2483 video_id = mobj.group(1)
2484 if video_id not in video_ids:
2485 video_ids.append(video_id)
2486 if len(video_ids) == n:
2487 # Specified n videos reached
2488 for id in video_ids:
2489 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2492 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No next page: extract what we have and stop.
2493 for id in video_ids:
2494 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2497 pagenum = pagenum + 1
# Handles 'yvsearch[N|all]:<query>' pseudo-URLs: scrapes Yahoo! Video search
# pages and delegates each hit to the wrapped YahooIE. Same template as the
# other search IEs; interior lines are elided in this listing.
2500 class YahooSearchIE(InfoExtractor):
2501 """Information Extractor for Yahoo! Video search queries."""
2502 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2503 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2504 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2505 _MORE_PAGES_INDICATOR = r'\s*Next'
2507 _max_yahoo_results = 1000
2508 IE_NAME = u'video.yahoo:search'
2510 def __init__(self, yahoo_ie, downloader=None):
2511 InfoExtractor.__init__(self, downloader)
2512 self._yahoo_ie = yahoo_ie
2514 def report_download_page(self, query, pagenum):
2515 """Report attempt to download playlist page with given number."""
2516 query = query.decode(preferredencoding())
2517 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2519 def _real_initialize(self):
2520 self._yahoo_ie.initialize()
2522 def _real_extract(self, query):
2523 mobj = re.match(self._VALID_URL, query)
2525 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2528 prefix, query = query.split(':')
2530 query = query.encode('utf-8')
2532 self._download_n_results(query, 1)
2534 elif prefix == 'all':
2535 self._download_n_results(query, self._max_yahoo_results)
2541 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2543 elif n > self._max_yahoo_results:
2544 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2545 n = self._max_yahoo_results
2546 self._download_n_results(query, n)
2548 except ValueError: # parsing prefix as integer fails
2549 self._download_n_results(query, 1)
2552 def _download_n_results(self, query, n):
2553 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedupe uses a separate set for O(1) membership.
2556 already_seen = set()
2560 self.report_download_page(query, pagenum)
2561 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2562 request = urllib2.Request(result_url)
2564 page = urllib2.urlopen(request).read()
2565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2566 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2569 # Extract video identifiers
2570 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2571 video_id = mobj.group(1)
2572 if video_id not in already_seen:
2573 video_ids.append(video_id)
2574 already_seen.add(video_id)
2575 if len(video_ids) == n:
2576 # Specified n videos reached
2577 for id in video_ids:
2578 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2581 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No next page: extract what we have and stop.
2582 for id in video_ids:
2583 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2586 pagenum = pagenum + 1
# Extractor for YouTube playlists/courses/artist pages: pages through the
# playlist HTML, collects watch?v= ids, applies --playlist-start/--playlist-end
# slicing, and delegates each video to the wrapped YoutubeIE.
# Interior lines are elided in this listing.
2589 class YoutubePlaylistIE(InfoExtractor):
2590 """Information Extractor for YouTube playlists."""
# group(1): list-type query key (p|a|list); group(2): playlist id;
# group(3): optional direct video id embedded in the URL.
2592 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2593 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2594 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2595 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2597 IE_NAME = u'youtube:playlist'
2599 def __init__(self, youtube_ie, downloader=None):
2600 InfoExtractor.__init__(self, downloader)
2601 self._youtube_ie = youtube_ie
2603 def report_download_page(self, playlist_id, pagenum):
2604 """Report attempt to download playlist page with given number."""
2605 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2607 def _real_initialize(self):
2608 self._youtube_ie.initialize()
2610 def _real_extract(self, url):
2611 # Extract playlist id
2612 mobj = re.match(self._VALID_URL, url)
2614 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video-in-playlist URL short-circuits to plain video extraction.
2618 if mobj.group(3) is not None:
2619 self._youtube_ie.extract(mobj.group(3))
2622 # Download playlist pages
2623 # prefix is 'p' as default for playlists but there are other types that need extra care
2624 playlist_prefix = mobj.group(1)
2625 if playlist_prefix == 'a':
2626 playlist_access = 'artist'
2628 playlist_prefix = 'p'
2629 playlist_access = 'view_play_list'
2630 playlist_id = mobj.group(2)
2635 self.report_download_page(playlist_id, pagenum)
2636 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2637 request = urllib2.Request(url)
2639 page = urllib2.urlopen(request).read()
2640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2641 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2644 # Extract video identifiers
2646 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2647 if mobj.group(1) not in ids_in_page:
2648 ids_in_page.append(mobj.group(1))
2649 video_ids.extend(ids_in_page)
2651 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2653 pagenum = pagenum + 1
# Apply user-requested slice; playliststart is 1-based on the CLI, hence -1.
2655 playliststart = self._downloader.params.get('playliststart', 1) - 1
2656 playlistend = self._downloader.params.get('playlistend', -1)
2657 if playlistend == -1:
2658 video_ids = video_ids[playliststart:]
2660 video_ids = video_ids[playliststart:playlistend]
2662 for id in video_ids:
2663 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Extractor for a YouTube user's uploads: pages through the GData uploads
# feed (_GDATA_PAGE_SIZE ids per request), slices per playliststart/end, and
# delegates each id to the wrapped YoutubeIE. Interior lines are elided.
2667 class YoutubeUserIE(InfoExtractor):
2668 """Information Extractor for YouTube users."""
2670 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2671 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2672 _GDATA_PAGE_SIZE = 50
2673 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2674 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2676 IE_NAME = u'youtube:user'
2678 def __init__(self, youtube_ie, downloader=None):
2679 InfoExtractor.__init__(self, downloader)
2680 self._youtube_ie = youtube_ie
2682 def report_download_page(self, username, start_index):
2683 """Report attempt to download user page."""
2684 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2685 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2687 def _real_initialize(self):
2688 self._youtube_ie.initialize()
2690 def _real_extract(self, url):
2692 mobj = re.match(self._VALID_URL, url)
2694 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2697 username = mobj.group(1)
2699 # Download video ids using YouTube Data API. Result size per
2700 # query is limited (currently to 50 videos) so we need to query
2701 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2708 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2709 self.report_download_page(username, start_index)
2711 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2714 page = urllib2.urlopen(request).read()
2715 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2716 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2719 # Extract video identifiers
2722 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2723 if mobj.group(1) not in ids_in_page:
2724 ids_in_page.append(mobj.group(1))
2726 video_ids.extend(ids_in_page)
2728 # A little optimization - if current page is not
2729 # "full", ie. does not contain PAGE_SIZE video ids then
2730 # we can assume that this page is the last one - there
2731 # are no more ids on further pages - no need to query
2734 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2739 all_ids_count = len(video_ids)
# Same 1-based slice convention as YoutubePlaylistIE.
2740 playliststart = self._downloader.params.get('playliststart', 1) - 1
2741 playlistend = self._downloader.params.get('playlistend', -1)
2743 if playlistend == -1:
2744 video_ids = video_ids[playliststart:]
2746 video_ids = video_ids[playliststart:playlistend]
2748 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2749 (username, all_ids_count, len(video_ids)))
2751 for video_id in video_ids:
2752 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: POSTs the "Free download" form,
# scrapes the real fileshare URL and the file title, then hands the result
# to the downloader. Interior lines are elided in this listing.
2755 class DepositFilesIE(InfoExtractor):
2756 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the optional '../' segment skips a
# two-character locale path component.
2758 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2759 IE_NAME = u'DepositFiles'
2761 def __init__(self, downloader=None):
2762 InfoExtractor.__init__(self, downloader)
2764 def report_download_webpage(self, file_id):
2765 """Report webpage download."""
2766 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2768 def report_extraction(self, file_id):
2769 """Report information extraction."""
2770 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2772 def _real_extract(self, url):
2773 # At this point we have a new file
2774 self._downloader.increment_downloads()
2776 file_id = url.split('/')[-1]
2777 # Rebuild url in english locale
2778 url = 'http://depositfiles.com/en/files/' + file_id
2780 # Retrieve file webpage with 'Free download' button pressed
# POST body simulating the free-download form submission.
2781 free_download_indication = { 'gateway_result' : '1' }
2782 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2784 self.report_download_webpage(file_id)
2785 webpage = urllib2.urlopen(request).read()
2786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2787 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2790 # Search for the real file URL
2791 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2792 if (mobj is None) or (mobj.group(1) is None):
2793 # Try to figure out reason of the error.
# The site embeds a human-readable restriction notice; surface it verbatim.
2794 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2795 if (mobj is not None) and (mobj.group(1) is not None):
2796 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2797 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2799 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2802 file_url = mobj.group(1)
2803 file_extension = os.path.splitext(file_url)[1][1:]
2805 # Search for file title
2806 mobj = re.search(r'<b title="(.*?)">', webpage)
2808 self._downloader.trouble(u'ERROR: unable to extract title')
2810 file_title = mobj.group(1).decode('utf-8')
2813 # Process file information
2814 self._downloader.process_info({
2815 'id': file_id.decode('utf-8'),
2816 'url': file_url.decode('utf-8'),
2818 'upload_date': u'NA',
2819 'title': file_title,
2820 'stitle': file_title,
2821 'ext': file_extension.decode('utf-8'),
2825 except UnavailableVideoError, err:
2826 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Facebook videos: optionally logs in (CLI credentials or
# .netrc), downloads the video page, scrapes title/owner/thumbnail and the
# per-format source URLs from inline JS, then applies the same format
# selection logic as the YouTube extractor. Interior lines are elided.
2829 class FacebookIE(InfoExtractor):
2830 """Information Extractor for Facebook"""
2832 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2833 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2834 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used both for scraping URLs and for format selection.
2835 _available_formats = ['video', 'highqual', 'lowqual']
2836 _video_extensions = {
2841 IE_NAME = u'facebook'
2843 def __init__(self, downloader=None):
2844 InfoExtractor.__init__(self, downloader)
2846 def _reporter(self, message):
2847 """Add header and report message."""
2848 self._downloader.to_screen(u'[facebook] %s' % message)
2850 def report_login(self):
2851 """Report attempt to log in."""
2852 self._reporter(u'Logging in')
2854 def report_video_webpage_download(self, video_id):
2855 """Report attempt to download video webpage."""
2856 self._reporter(u'%s: Downloading video webpage' % video_id)
2858 def report_information_extraction(self, video_id):
2859 """Report attempt to extract video information."""
2860 self._reporter(u'%s: Extracting video information' % video_id)
2862 def _parse_page(self, video_webpage):
2863 """Extract video information from page"""
# Each metadata field has its own scraping regex; missing fields are simply
# left out of the returned dict (callers check membership).
2865 data = {'title': r'\("video_title", "(.*?)"\)',
2866 'description': r'<div class="datawrap">(.*?)</div>',
2867 'owner': r'\("video_owner_name", "(.*?)"\)',
2868 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2871 for piece in data.keys():
2872 mobj = re.search(data[piece], video_webpage)
2873 if mobj is not None:
2874 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2878 for fmt in self._available_formats:
2879 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2880 if mobj is not None:
2881 # URL is in a Javascript segment inside an escaped Unicode format within
2882 # the generally utf-8 page
2883 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2884 video_info['video_urls'] = video_urls
2888 def _real_initialize(self):
2889 if self._downloader is None:
2894 downloader_params = self._downloader.params
2896 # Attempt to use provided username and password or .netrc data
2897 if downloader_params.get('username', None) is not None:
2898 useremail = downloader_params['username']
2899 password = downloader_params['password']
2900 elif downloader_params.get('usenetrc', False):
2902 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2903 if info is not None:
2907 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2908 except (IOError, netrc.NetrcParseError), err:
# Login problems are warnings, not fatal: extraction may still work for
# public videos.
2909 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2912 if useremail is None:
2921 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2924 login_results = urllib2.urlopen(request).read()
# A login <form> in the response means the credentials were rejected.
2925 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2926 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2928 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2929 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2932 def _real_extract(self, url):
2933 mobj = re.match(self._VALID_URL, url)
2935 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2937 video_id = mobj.group('ID')
2940 self.report_video_webpage_download(video_id)
2941 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2943 page = urllib2.urlopen(request)
2944 video_webpage = page.read()
2945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2946 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2949 # Start extracting information
2950 self.report_information_extraction(video_id)
2952 # Extract information
2953 video_info = self._parse_page(video_webpage)
2956 if 'owner' not in video_info:
2957 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2959 video_uploader = video_info['owner']
2962 if 'title' not in video_info:
2963 self._downloader.trouble(u'ERROR: unable to extract video title')
2965 video_title = video_info['title']
2966 video_title = video_title.decode('utf-8')
2967 video_title = sanitize_title(video_title)
2969 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to empty string.
2972 if 'thumbnail' not in video_info:
2973 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2974 video_thumbnail = ''
2976 video_thumbnail = video_info['thumbnail']
2980 if 'upload_date' in video_info:
2981 upload_time = video_info['upload_date']
2982 timetuple = email.utils.parsedate_tz(upload_time)
2983 if timetuple is not None:
# Normalize the RFC-2822 date into YYYYMMDD.
2985 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2990 video_description = video_info.get('description', 'No description available.')
# Format selection mirrors the YouTube extractor: None -> best available,
# 'worst' -> last, '-1' -> all, otherwise the exact requested format.
2992 url_map = video_info['video_urls']
2993 if len(url_map.keys()) > 0:
2994 # Decide which formats to download
2995 req_format = self._downloader.params.get('format', None)
2996 format_limit = self._downloader.params.get('format_limit', None)
2998 if format_limit is not None and format_limit in self._available_formats:
2999 format_list = self._available_formats[self._available_formats.index(format_limit):]
3001 format_list = self._available_formats
3002 existing_formats = [x for x in format_list if x in url_map]
3003 if len(existing_formats) == 0:
3004 self._downloader.trouble(u'ERROR: no known formats available for video')
3006 if req_format is None:
3007 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3008 elif req_format == 'worst':
3009 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3010 elif req_format == '-1':
3011 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3014 if req_format not in url_map:
3015 self._downloader.trouble(u'ERROR: requested format not available')
3017 video_url_list = [(req_format, url_map[req_format])] # Specific format
3019 for format_param, video_real_url in video_url_list:
3021 # At this point we have a new video
3022 self._downloader.increment_downloads()
3025 video_extension = self._video_extensions.get(format_param, 'mp4')
3028 # Process video information
3029 self._downloader.process_info({
3030 'id': video_id.decode('utf-8'),
3031 'url': video_real_url.decode('utf-8'),
3032 'uploader': video_uploader.decode('utf-8'),
3033 'upload_date': upload_date,
3034 'title': video_title,
3035 'stitle': simple_title,
3036 'ext': video_extension.decode('utf-8'),
3037 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3038 'thumbnail': video_thumbnail.decode('utf-8'),
3039 'description': video_description.decode('utf-8'),
3042 except UnavailableVideoError, err:
3043 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this file is a line-numbered listing (the leading numbers are
# part of the text) with gaps — the embedded numbers jump, so try:/else:/return
# lines are missing between some statements. Comments below hedge accordingly.
3045 class BlipTVIE(InfoExtractor):
3046 """Information extractor for blip.tv"""
# group(1) of _VALID_URL is the URL path after the blip.tv host.
3048 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pulls a lowercase filename extension off the end of a media URL.
3049 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3050 IE_NAME = u'blip.tv'
3052 def report_extraction(self, file_id):
3053 """Report information extraction."""
3054 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3056 def report_direct_download(self, title):
# NOTE(review): docstring copy-pasted from report_extraction; this method
# actually announces that a direct (non-JSON) download was detected.
3057 """Report information extraction."""
3058 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3060 def _real_extract(self, url):
3061 mobj = re.match(self._VALID_URL, url)
# Presumably guarded by an "if mobj is None:" on a missing line — TODO confirm.
3063 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request JSON metadata; 'cchar' ('?' or '&') is chosen on a missing line.
3070 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3071 request = urllib2.Request(json_url)
3072 self.report_extraction(mobj.group(1))
3075 urlh = urllib2.urlopen(request)
3076 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3077 basename = url.split('/')[-1]
3078 title,ext = os.path.splitext(basename)
3079 title = title.decode('UTF-8')
3080 ext = ext.replace('.', '')
3081 self.report_direct_download(title)
# Fragment of an 'info' dict literal whose surrounding lines are missing here.
3086 'stitle': _simplify_title(title),
3090 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3091 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3093 if info is None: # Regular URL
3095 json_code = urlh.read()
3096 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3097 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3101 json_data = json.loads(json_code)
3102 if 'Post' in json_data:
3103 data = json_data['Post']
# NOTE(review): '%H' (24-hour) is combined with '%p' (AM/PM); '%I' would be
# the matching 12-hour code — looks wrong for PM timestamps, TODO confirm.
3107 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3108 video_url = data['media']['url']
3109 umobj = re.match(self._URL_EXT, video_url)
# Raised here and caught by the (ValueError,KeyError) handler below.
3111 raise ValueError('Can not determine filename extension')
3112 ext = umobj.group(1)
# Standard info-dict fields handed to process_info() (some keys missing here).
3115 'id': data['item_id'],
3117 'uploader': data['display_name'],
3118 'upload_date': upload_date,
3119 'title': data['title'],
3120 'stitle': _simplify_title(data['title']),
3122 'format': data['media']['mimeType'],
3123 'thumbnail': data['thumbnailUrl'],
3124 'description': data['description'],
3125 'player_url': data['embedUrl']
3127 except (ValueError,KeyError), err:
3128 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3131 self._downloader.increment_downloads()
3134 self._downloader.process_info(info)
3135 except UnavailableVideoError, err:
3136 self._downloader.trouble(u'\nERROR: unable to download video')
3139 class MyVideoIE(InfoExtractor):
3140 """Information Extractor for myvideo.de."""
# group(1) is the numeric video id; group(2) the slug.
3142 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3143 IE_NAME = u'myvideo'
3145 def __init__(self, downloader=None):
# Delegates straight to the base class; adds no state of its own.
3146 InfoExtractor.__init__(self, downloader)
3148 def report_download_webpage(self, video_id):
3149 """Report webpage download."""
3150 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3152 def report_extraction(self, video_id):
3153 """Report information extraction."""
3154 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3156 def _real_extract(self,url):
3157 mobj = re.match(self._VALID_URL, url)
3159 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3162 video_id = mobj.group(1)
3165 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3167 self.report_download_webpage(video_id)
3168 webpage = urllib2.urlopen(request).read()
3169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3170 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3173 self.report_extraction(video_id)
3174 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3177 self._downloader.trouble(u'ERROR: unable to extract media URL')
3179 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3181 mobj = re.search('<title>([^<]+)</title>', webpage)
3183 self._downloader.trouble(u'ERROR: unable to extract title')
3186 video_title = mobj.group(1)
3187 video_title = sanitize_title(video_title)
3189 simple_title = _simplify_title(video_title)
3192 self._downloader.process_info({
3196 'upload_date': u'NA',
3197 'title': video_title,
3198 'stitle': simple_title,
3203 except UnavailableVideoError:
3204 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): line-numbered listing with gaps; if/else and return lines are
# missing between some statements below — flow comments are hedged.
3206 class ComedyCentralIE(InfoExtractor):
3207 """Information extractor for The Daily Show and Colbert Report """
# Accepts ':tds' / ':colbert' style shortcuts or full show URLs.
3209 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3210 IE_NAME = u'comedycentral'
3212 def report_extraction(self, episode_id):
3213 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3215 def report_config_download(self, episode_id):
3216 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3218 def report_index_download(self, episode_id):
3219 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3221 def report_player_url(self, episode_id):
3222 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3224 def _real_extract(self, url):
3225 mobj = re.match(self._VALID_URL, url)
3227 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Rewrite shortcut names (':tds' etc.) to the show's full-episodes URL.
3230 if mobj.group('shortname'):
3231 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3232 url = u'http://www.thedailyshow.com/full-episodes/'
3234 url = u'http://www.colbertnation.com/full-episodes/'
3235 mobj = re.match(self._VALID_URL, url)
3236 assert mobj is not None
# No episode in the URL means "download the newest episode".
3238 dlNewest = not mobj.group('episode')
3240 epTitle = mobj.group('showname')
3242 epTitle = mobj.group('episode')
3244 req = urllib2.Request(url)
3245 self.report_extraction(epTitle)
3247 htmlHandle = urllib2.urlopen(req)
3248 html = htmlHandle.read()
3249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3250 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The newest-episode URL redirects; re-match against the final URL.
3253 url = htmlHandle.geturl()
3254 mobj = re.match(self._VALID_URL, url)
3256 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3258 if mobj.group('episode') == '':
3259 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3261 epTitle = mobj.group('episode')
# The Flash player URL embedded in the page carries the mtvnservices URI.
3263 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3264 if len(mMovieParams) == 0:
3265 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3268 playerUrl_raw = mMovieParams[0][0]
3269 self.report_player_url(epTitle)
# Follow redirects to learn the canonical player URL (used for rtmpdump).
3271 urlHandle = urllib2.urlopen(playerUrl_raw)
3272 playerUrl = urlHandle.geturl()
3273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3274 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3277 uri = mMovieParams[0][1]
3278 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3279 self.report_index_download(epTitle)
3281 indexXml = urllib2.urlopen(indexUrl).read()
3282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3283 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One episode page maps to several media items (acts); extract each.
3286 idoc = xml.etree.ElementTree.fromstring(indexXml)
3287 itemEls = idoc.findall('.//item')
3288 for itemEl in itemEls:
3289 mediaId = itemEl.findall('./guid')[0].text
3290 shortMediaId = mediaId.split(':')[-1]
3291 showId = mediaId.split(':')[-2].replace('.com', '')
3292 officialTitle = itemEl.findall('./title')[0].text
3293 officialDate = itemEl.findall('./pubDate')[0].text
3295 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3296 urllib.urlencode({'uri': mediaId}))
3297 configReq = urllib2.Request(configUrl)
3298 self.report_config_download(epTitle)
3300 configXml = urllib2.urlopen(configReq).read()
3301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3302 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# 'turls' (built on missing lines) collects (bitrate, rtmp-src) pairs.
3305 cdoc = xml.etree.ElementTree.fromstring(configXml)
3307 for rendition in cdoc.findall('.//rendition'):
3308 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3312 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3315 # For now, just pick the highest bitrate
3316 format,video_url = turls[-1]
3318 self._downloader.increment_downloads()
3320 effTitle = showId + u'-' + epTitle
# Info-dict fragment (some keys on missing lines).
3325 'upload_date': officialDate,
3327 'stitle': _simplify_title(effTitle),
3331 'description': officialTitle,
3332 'player_url': playerUrl
3336 self._downloader.process_info(info)
3337 except UnavailableVideoError, err:
3338 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3342 class EscapistIE(InfoExtractor):
3343 """Information extractor for The Escapist """
3345 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3346 IE_NAME = u'escapist'
3348 def report_extraction(self, showName):
3349 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3351 def report_config_download(self, showName):
3352 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3354 def _real_extract(self, url):
# Used only to unescape HTML entities pulled out of <meta> attributes.
3355 htmlParser = HTMLParser.HTMLParser()
3357 mobj = re.match(self._VALID_URL, url)
3359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3361 showName = mobj.group('showname')
3362 videoId = mobj.group('episode')
3364 self.report_extraction(showName)
3366 webPage = urllib2.urlopen(url).read()
3367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3368 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata lives in OpenGraph / description <meta> tags.
# NOTE(review): descMatch/imgMatch/playerUrlMatch are used without a None
# check — a missing tag would raise AttributeError. TODO confirm intended.
3371 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3372 description = htmlParser.unescape(descMatch.group(1))
3373 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3374 imgUrl = htmlParser.unescape(imgMatch.group(1))
3375 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3376 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries its JSON config URL in a 'config=' query parameter.
3377 configUrlMatch = re.search('config=(.*)$', playerUrl)
3378 configUrl = urllib2.unquote(configUrlMatch.group(1))
3380 self.report_config_download(showName)
3382 configJSON = urllib2.urlopen(configUrl).read()
3383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3384 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3387 # Technically, it's JavaScript, not JSON
# NOTE(review): blanket quote replacement is fragile — it would corrupt any
# apostrophe inside string values; works only because the config is simple.
3388 configJSON = configJSON.replace("'", '"')
3391 config = json.loads(configJSON)
3392 except (ValueError,), err:
3393 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is the actual video entry (index 0 presumably an intro/ad —
# TODO confirm).
3396 playlist = config['playlist']
3397 videoUrl = playlist[1]['url']
3399 self._downloader.increment_downloads()
# Info-dict fragment (remaining keys on missing lines).
3403 'uploader': showName,
3404 'upload_date': None,
3406 'stitle': _simplify_title(showName),
3409 'thumbnail': imgUrl,
3410 'description': description,
3411 'player_url': playerUrl,
3415 self._downloader.process_info(info)
3416 except UnavailableVideoError, err:
3417 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3420 class CollegeHumorIE(InfoExtractor):
3421 """Information extractor for collegehumor.com"""
3423 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3424 IE_NAME = u'collegehumor'
3426 def report_webpage(self, video_id):
3427 """Report information extraction."""
3428 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3430 def report_extraction(self, video_id):
3431 """Report information extraction."""
3432 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3434 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3435 htmlParser = HTMLParser.HTMLParser()
3437 mobj = re.match(self._VALID_URL, url)
3439 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3441 video_id = mobj.group('videoid')
3443 self.report_webpage(video_id)
3444 request = urllib2.Request(url)
3446 webpage = urllib2.urlopen(request).read()
3447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3448 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page exposes a second, internal id used by the moogaloop XML feed.
3451 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3453 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3455 internal_video_id = m.group('internalvideoid')
# Start of the info dict (surrounding lines missing in this listing).
3459 'internal_id': internal_video_id,
3462 self.report_extraction(video_id)
3463 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3465 metaXml = urllib2.urlopen(xmlUrl).read()
3466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3467 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Pull title/description/file URL out of the <video> node; an IndexError
# from the findall(...)[0] calls is presumably caught on a missing line
# and reported as invalid metadata below.
3470 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3472 videoNode = mdoc.findall('./video')[0]
3473 info['description'] = videoNode.findall('./description')[0].text
3474 info['title'] = videoNode.findall('./caption')[0].text
3475 info['stitle'] = _simplify_title(info['title'])
3476 info['url'] = videoNode.findall('./file')[0].text
3477 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the file URL.
3478 info['ext'] = info['url'].rpartition('.')[2]
3479 info['format'] = info['ext']
3481 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3484 self._downloader.increment_downloads()
3487 self._downloader.process_info(info)
3488 except UnavailableVideoError, err:
3489 self._downloader.trouble(u'\nERROR: unable to download video')
3492 class XVideosIE(InfoExtractor):
3493 """Information extractor for xvideos.com"""
3495 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3496 IE_NAME = u'xvideos'
3498 def report_webpage(self, video_id):
3499 """Report information extraction."""
3500 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3502 def report_extraction(self, video_id):
3503 """Report information extraction."""
3504 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3506 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3507 htmlParser = HTMLParser.HTMLParser()
3509 mobj = re.match(self._VALID_URL, url)
3511 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3513 video_id = mobj.group(1).decode('utf-8')
3515 self.report_webpage(video_id)
3517 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3519 webpage = urllib2.urlopen(request).read()
3520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3521 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3524 self.report_extraction(video_id)
# The percent-encoded media URL sits in a 'flv_url=' query fragment.
3528 mobj = re.search(r'flv_url=(.+?)&', webpage)
3530 self._downloader.trouble(u'ERROR: unable to extract video url')
3532 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
3536 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3538 self._downloader.trouble(u'ERROR: unable to extract video title')
3540 video_title = mobj.group(1).decode('utf-8')
3543 # Extract video thumbnail
# NOTE(review): the dots in 'img.*?' and 'xvideos.com' are unescaped
# metacharacters — harmless in practice but imprecise.
3544 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3546 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3548 video_thumbnail = mobj.group(1).decode('utf-8')
3552 self._downloader.increment_downloads()
# Info-dict fragment (remaining keys on missing lines).
3557 'upload_date': None,
3558 'title': video_title,
3559 'stitle': _simplify_title(video_title),
3562 'thumbnail': video_thumbnail,
3563 'description': None,
3568 self._downloader.process_info(info)
3569 except UnavailableVideoError, err:
3570 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3573 class SoundcloudIE(InfoExtractor):
3574 """Information extractor for soundcloud.com
3575 To access the media, the uid of the song and a stream token
3576 must be extracted from the page source and the script must make
3577 a request to media.soundcloud.com/crossdomain.xml. Then
3578 the media can be grabbed by requesting from an url composed
3579 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
3582 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3583 IE_NAME = u'soundcloud'
3585 def __init__(self, downloader=None):
# Delegates straight to the base class; adds no state of its own.
3586 InfoExtractor.__init__(self, downloader)
3588 def report_webpage(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3592 def report_extraction(self, video_id):
3593 """Report information extraction."""
3594 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3596 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3597 htmlParser = HTMLParser.HTMLParser()
3599 mobj = re.match(self._VALID_URL, url)
3601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3604 # extract uploader (which is in the url)
3605 uploader = mobj.group(1).decode('utf-8')
3606 # extract simple title (uploader + slug of song title)
3607 slug_title = mobj.group(2).decode('utf-8')
3608 simple_title = uploader + '-' + slug_title
3610 self.report_webpage('%s/%s' % (uploader, slug_title))
3612 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3614 webpage = urllib2.urlopen(request).read()
3615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3616 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3619 self.report_extraction('%s/%s' % (uploader, slug_title))
3621 # extract uid and stream token that soundcloud hands out for access
3622 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3624 video_id = mobj.group(1)
3625 stream_token = mobj.group(2)
3627 # extract unsimplified title
3628 mobj = re.search('"title":"(.*?)",', webpage)
# NOTE(review): 'title' extracted here is never used below — the info dict
# passes simple_title for both 'title' and 'stitle'. Looks like a bug;
# TODO confirm against upstream.
3630 title = mobj.group(1)
3632 # construct media url (with uid/token)
3633 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3634 mediaURL = mediaURL % (video_id, stream_token)
3637 description = u'No description available'
3638 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3640 description = mobj.group(1)
# Upload date parsed from the page's pretty-date; failures tolerated
# (handler body on missing lines).
3644 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3647 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3648 except Exception, e:
3651 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): request is built but the urlopen call is not visible here.
3652 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3655 self._downloader.process_info({
3656 'id': video_id.decode('utf-8'),
3658 'uploader': uploader.decode('utf-8'),
3659 'upload_date': upload_date,
3660 'title': simple_title.decode('utf-8'),
3661 'stitle': simple_title.decode('utf-8'),
3665 'description': description.decode('utf-8')
3667 except UnavailableVideoError:
3668 self._downloader.trouble(u'\nERROR: unable to download video')
3671 class InfoQIE(InfoExtractor):
3672 """Information extractor for infoq.com"""
3674 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3677 def report_webpage(self, video_id):
3678 """Report information extraction."""
3679 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3681 def report_extraction(self, video_id):
3682 """Report information extraction."""
3683 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3685 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3686 htmlParser = HTMLParser.HTMLParser()
3688 mobj = re.match(self._VALID_URL, url)
3690 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3693 self.report_webpage(url)
3695 request = urllib2.Request(url)
3697 webpage = urllib2.urlopen(request).read()
3698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3699 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3702 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
3706 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3708 self._downloader.trouble(u'ERROR: unable to extract video url')
3710 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3714 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3716 self._downloader.trouble(u'ERROR: unable to extract video title')
3718 video_title = mobj.group(1).decode('utf-8')
3720 # Extract description
3721 video_description = u'No description available.'
3722 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3723 if mobj is not None:
3724 video_description = mobj.group(1).decode('utf-8')
# Derive id/extension from the last path component of the RTMP URL.
3726 video_filename = video_url.split('/')[-1]
3727 video_id, extension = video_filename.split('.')
3729 self._downloader.increment_downloads()
# Info-dict fragment (remaining keys on missing lines).
3734 'upload_date': None,
3735 'title': video_title,
3736 'stitle': _simplify_title(video_title),
3738 'format': extension, # Extension is always(?) mp4, but seems to be flv
3740 'description': video_description,
3745 self._downloader.process_info(info)
3746 except UnavailableVideoError, err:
3747 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3749 class MixcloudIE(InfoExtractor):
3750 """Information extractor for www.mixcloud.com"""
3751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3752 IE_NAME = u'mixcloud'
3754 def __init__(self, downloader=None):
# Delegates straight to the base class; adds no state of its own.
3755 InfoExtractor.__init__(self, downloader)
3757 def report_download_json(self, file_id):
3758 """Report JSON download."""
# NOTE(review): file_id parameter is accepted but not included in the message.
3759 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3761 def report_extraction(self, file_id):
3762 """Report information extraction."""
3763 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3765 def get_urls(self, jsonData, fmt, bitrate='best'):
3766 """Get urls from 'audio_formats' section in json"""
3769 bitrate_list = jsonData[fmt]
# 'best' (or an unknown bitrate) falls back to the highest available one.
3770 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3771 bitrate = max(bitrate_list) # select highest
3773 url_list = jsonData[fmt][bitrate]
3774 except TypeError: # we have no bitrate info.
3775 url_list = jsonData[fmt]
3779 def check_urls(self, url_list):
3780 """Returns 1st active url from list"""
# Probe each candidate with a GET; the first that opens without error wins.
3781 for url in url_list:
3783 urllib2.urlopen(url)
3785 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3790 def _print_formats(self, formats):
# Dump "format<TAB>bitrate<TAB>[ext]" rows for --list-formats.
3791 print 'Available formats:'
3792 for fmt in formats.keys():
3793 for b in formats[fmt]:
3795 ext = formats[fmt][b][0]
3796 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3797 except TypeError: # we have no bitrate info
3798 ext = formats[fmt][0]
3799 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3802 def _real_extract(self, url):
3803 mobj = re.match(self._VALID_URL, url)
3805 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3807 # extract uploader & filename from url
3808 uploader = mobj.group(1).decode('utf-8')
3809 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3811 # construct API request
3812 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3813 # retrieve .json file with links to files
3814 request = urllib2.Request(file_url)
3816 self.report_download_json(file_url)
3817 jsonData = urllib2.urlopen(request).read()
3818 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3819 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3823 json_data = json.loads(jsonData)
3824 player_url = json_data['player_swf_url']
3825 formats = dict(json_data['audio_formats'])
3827 req_format = self._downloader.params.get('format', None)
3830 if self._downloader.params.get('listformats', None):
3831 self._print_formats(formats)
# No/-best format request: take the first format whose URL responds.
3834 if req_format is None or req_format == 'best':
3835 for format_param in formats.keys():
3836 url_list = self.get_urls(formats, format_param)
3838 file_url = self.check_urls(url_list)
3839 if file_url is not None:
3842 if req_format not in formats.keys():
3843 self._downloader.trouble(u'ERROR: format is not available')
3846 url_list = self.get_urls(formats, req_format)
3847 file_url = self.check_urls(url_list)
3848 format_param = req_format
3851 self._downloader.increment_downloads()
3853 # Process file information
3854 self._downloader.process_info({
3855 'id': file_id.decode('utf-8'),
3856 'url': file_url.decode('utf-8'),
3857 'uploader': uploader.decode('utf-8'),
3858 'upload_date': u'NA',
3859 'title': json_data['name'],
3860 'stitle': _simplify_title(json_data['name']),
3861 'ext': file_url.split('.')[-1].decode('utf-8'),
3862 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3863 'thumbnail': json_data['thumbnail_url'],
3864 'description': json_data['description'],
3865 'player_url': player_url.decode('utf-8'),
3867 except UnavailableVideoError, err:
3868 self._downloader.trouble(u'ERROR: unable to download file')
3870 class StanfordOpenClassroomIE(InfoExtractor):
3871 """Information extractor for Stanford's Open ClassRoom"""
# Matches the site root, a CoursePage (course only), or a VideoPage
# (course + video) — _real_extract branches on which groups matched.
3873 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3874 IE_NAME = u'stanfordoc'
3876 def report_download_webpage(self, objid):
3877 """Report information extraction."""
3878 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3880 def report_extraction(self, video_id):
3881 """Report information extraction."""
3882 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3884 def _real_extract(self, url):
3885 mobj = re.match(self._VALID_URL, url)
3887 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: a single video — fetch its XML descriptor and download it.
3890 if mobj.group('course') and mobj.group('video'): # A specific video
3891 course = mobj.group('course')
3892 video = mobj.group('video')
3894 'id': _simplify_title(course + '_' + video),
3897 self.report_extraction(info['id'])
3898 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3899 xmlUrl = baseUrl + video + '.xml'
3901 metaXml = urllib2.urlopen(xmlUrl).read()
3902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3903 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3905 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3907 info['title'] = mdoc.findall('./title')[0].text
3908 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3910 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3912 info['stitle'] = _simplify_title(info['title'])
3913 info['ext'] = info['url'].rpartition('.')[2]
3914 info['format'] = info['ext']
3915 self._downloader.increment_downloads()
3917 self._downloader.process_info(info)
3918 except UnavailableVideoError, err:
3919 self._downloader.trouble(u'\nERROR: unable to download video')
# Branch 2: a course page — enumerate its VideoPage links and recurse
# via self.extract() on each reference entry.
3920 elif mobj.group('course'): # A course page
3921 unescapeHTML = HTMLParser.HTMLParser().unescape
3923 course = mobj.group('course')
3925 'id': _simplify_title(course),
3929 self.report_download_webpage(info['id'])
3931 coursepage = urllib2.urlopen(url).read()
3932 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3933 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3936 m = re.search('<h1>([^<]+)</h1>', coursepage)
3938 info['title'] = unescapeHTML(m.group(1))
3940 info['title'] = info['id']
3941 info['stitle'] = _simplify_title(info['title'])
3943 m = re.search('<description>([^<]+)</description>', coursepage)
3945 info['description'] = unescapeHTML(m.group(1))
3947 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# Reference entries built per link (dict-literal lines partly missing).
3950 'type': 'reference',
3951 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3955 for entry in info['list']:
3956 assert entry['type'] == 'reference'
3957 self.extract(entry['url'])
# Branch 3 (else, on a missing line): the site root — enumerate all
# CoursePage links and recurse into each course.
3959 unescapeHTML = HTMLParser.HTMLParser().unescape
3962 'id': 'Stanford OpenClassroom',
3966 self.report_download_webpage(info['id'])
3967 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3969 rootpage = urllib2.urlopen(rootURL).read()
3970 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3971 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3974 info['title'] = info['id']
3975 info['stitle'] = _simplify_title(info['title'])
3977 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3980 'type': 'reference',
3981 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3985 for entry in info['list']:
3986 assert entry['type'] == 'reference'
3987 self.extract(entry['url'])
3989 class MTVIE(InfoExtractor):
3990 """Information extractor for MTV.com"""
# 'proto' group lets scheme-less URLs through; fixed up in _real_extract.
3992 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3995 def report_webpage(self, video_id):
3996 """Report information extraction."""
3997 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3999 def report_extraction(self, video_id):
4000 """Report information extraction."""
4001 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4003 def _real_extract(self, url):
4004 mobj = re.match(self._VALID_URL, url)
4006 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4008 if not mobj.group('proto'):
4009 url = 'http://' + url
4010 video_id = mobj.group('videoid')
4011 self.report_webpage(video_id)
4013 request = urllib2.Request(url)
4015 webpage = urllib2.urlopen(request).read()
4016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4017 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4020 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4022 self._downloader.trouble(u'ERROR: unable to extract song name')
4024 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4025 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4027 self._downloader.trouble(u'ERROR: unable to extract performer')
4029 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4030 video_title = performer + ' - ' + song_name
4032 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4034 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4036 mtvn_uri = mobj.group(1)
4038 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4040 self._downloader.trouble(u'ERROR: unable to extract content id')
4042 content_id = mobj.group(1)
4044 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4045 self.report_extraction(video_id)
4046 request = urllib2.Request(videogen_url)
4048 metadataXml = urllib2.urlopen(request).read()
4049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4050 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4053 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4054 renditions = mdoc.findall('.//rendition')
4056 # For now, always pick the highest quality.
4057 rendition = renditions[-1]
4060 _,_,ext = rendition.attrib['type'].partition('/')
4061 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4062 video_url = rendition.find('./src').text
4064 self._downloader.trouble('Invalid rendition field.')
4067 self._downloader.increment_downloads()
4071 'uploader': performer,
4072 'title': video_title,
4073 'stitle': _simplify_title(video_title),
4079 self._downloader.process_info(info)
4080 except UnavailableVideoError, err:
4081 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances are attached to a downloader through its add_post_processor()
    method ("mutual registration", just like InfoExtractor objects).  After a
    successful download the downloader walks its chain of PostProcessors,
    feeding each run() the value the previous one returned.

    A PostProcessor that returns None stops the chain; otherwise processing
    continues until the chain is exhausted.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach *downloader* to this post-processor."""
        self._downloader = downloader

    def run(self, information):
        """Process one finished download.

        *information* is an InfoExtractor-style dictionary augmented with a
        "filepath" key pointing at the downloaded file.  Return None to stop
        the post-processing chain, or an information dictionary (possibly
        this one, possibly modified) to pass along to the next processor.
        May raise PostProcessingError, which the downloader handles.

        The base implementation is the identity transform.
        """
        return information
class AudioConversionError(Exception):
    """Raised when an external audio conversion (ffmpeg/ffprobe) fails.

    Derives from Exception rather than BaseException (PEP 352): only
    interpreter-exit signals like KeyboardInterrupt belong directly under
    BaseException, and a BaseException subclass would slip past generic
    `except Exception` handlers.
    """
    def __init__(self, message):
        Exception.__init__(self, message)
        # Keep the .message attribute: FFmpegExtractAudioPP.run reads e.message.
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded video
    into a standalone audio file, using the external ffprobe/ffmpeg tools."""

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec        # 'best'|'aac'|'mp3'|'vorbis'|'m4a'|'wav'
        self._preferredquality = preferredquality    # ffmpeg -ab bitrate spec, or None
        self._keepvideo = keepvideo                  # keep the source video after extraction

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name of the file at *path* (via ffprobe),
        or None when it cannot be determined (ffprobe missing/failed)."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            # open() instead of the deprecated file() builtin; stderr is discarded.
            handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # ffprobe prints codec_name before codec_type inside a stream block.
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Convert *path* into *out_path* with ffmpeg.  *codec* of None means
        "let ffmpeg pick" (no -acodec flag).  Raises AudioConversionError on
        any failure."""
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:  # ENOENT: binary not installed
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                raise e
        if p.returncode != 0:
            # The last stderr line usually carries the actual error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Extract/convert the audio of the downloaded file.

        Returns the updated information dict (with "filepath" pointing at the
        audio file), or None to stop the post-processing chain on failure."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # Source codec is something else: re-encode to MP3.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except:
            # Bare except kept: AudioConversionError may not derive from
            # Exception in this file, so `except Exception` could miss it.
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except:
                # Best-effort only; never fail post-processing over utime.
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4260 def updateSelf(downloader, filename):
4261 ''' Update the program file with the latest version from the repository '''
4262 # Note: downloader only used for options
4263 if not os.access(filename, os.W_OK):
4264 sys.exit('ERROR: no write permissions on %s' % filename)
4266 downloader.to_screen(u'Updating to latest version...')
4270 urlh = urllib.urlopen(UPDATE_URL)
4271 newcontent = urlh.read()
4273 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4274 if vmatch is not None and vmatch.group(1) == __version__:
4275 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4279 except (IOError, OSError), err:
4280 sys.exit('ERROR: unable to download latest version')
4283 outf = open(filename, 'wb')
4285 outf.write(newcontent)
4288 except (IOError, OSError), err:
4289 sys.exit('ERROR: unable to overwrite current version')
4291 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
    """Read extra command-line arguments from the config file *filename_bytes*.

    Returns a flat list of shell-style tokens (shlex rules, '#' comments
    honored).  A missing file yields an empty list without complaint.
    """
    try:
        optionf = open(filename_bytes)
    except IOError:
        return [] # silently skip if file is not present
    try:
        res = []
        for l in optionf:
            res += shlex.split(l, comments=True)
    finally:
        # Close deterministically; no `with` to stay Python 2.4 compatible.
        optionf.close()
    return res
def _format_option_string(option):
    ''' ('-o', '--option') -> -o, --format METAVAR'''
    # Take the first spelling of each kind that exists, join them with a
    # comma, then tack on the metavar for options that expect a value.
    names = [alts[0] for alts in (option._short_opts, option._long_opts) if alts]
    text = ', '.join(names)
    if option.takes_value():
        text += ' %s' % option.metavar
    return text
def _find_term_columns():
    """Best-effort terminal width: honor $COLUMNS first, then ask the tty
    via `stty size`.  Returns an int, or None when undeterminable."""
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        # `stty size` prints "rows cols"; we want the second field.
        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Narrowed from a bare except: stty absent (OSError), empty/garbled
        # output (IndexError/ValueError) — but let Ctrl-C propagate.
        pass
    return None
# NOTE(review): interior of parseOpts() — its `def` line and several
# statements (the `kw = {` opener/closer for the dict fragment below, the
# if/else around the XDG config path, `opts = []` style initializers) were
# elided from this view.  Code lines are kept verbatim; only comments added.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string

# Keyword arguments for the OptionParser below (dict braces elided here).
'version' : __version__,
'usage' : '%prog [options] url [url...]',
'conflict_handler' : 'resolve',
# conflict_handler='resolve' matters: -v is declared twice (--version in the
# general group, --verbose in the verbosity group); the later one wins.
parser = optparse.OptionParser(**kw)

# One OptionGroup per help section; registration order (further below)
# controls display order, not the order of construction here.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
action='help', help='print this help text and exit')
general.add_option('-v', '--version',
action='version', help='print program version and exit')
general.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
action='store_true', dest='dump_user_agent',
help='display the current browser identification', default=False)
general.add_option('--list-extractors',
action='store_true', dest='list_extractors',
help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
action='store_true', dest='writesubtitles',
help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
action='store', dest='subtitleslang', metavar='LANG',
help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
action='store_true', dest='getthumbnail',
help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
action='store_true', dest='getdescription',
help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
action='store_true', dest='getfilename',
help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
action='store_true', dest='getformat',
help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
action='store_true', dest='consoletitle',
help='display progress in console titlebar', default=False)
verbosity.add_option('-v', '--verbose',
action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
action='store_true', dest='autonumber',
help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
action='store_false', dest='continue_dl',
help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
action='store_false', dest='updatetime',
help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
action='store_true', dest='writedescription',
help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Config file options come first so the real command line overrides them:
# system-wide /etc file, then the per-user file, then sys.argv.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
# (fallback assignment below belongs to the else-branch when XDG_CONFIG_HOME
# is unset — the if/else lines were elided from this view)
userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    # NOTE(review): the docstring terminator and most entries of the returned
    # list (including its opening bracket) were elided from this view; only
    # comments were added, code lines are verbatim.
    # Shared IE instances: the search/playlist/user extractors delegate to them.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # Entries of the returned extractor list (surrounding brackets elided).
    YoutubePlaylistIE(youtube_ie),
    YoutubeUserIE(youtube_ie),
    YoutubeSearchIE(youtube_ie),
    MetacafeIE(youtube_ie),
    GoogleSearchIE(google_ie),
    YahooSearchIE(yahoo_ie),
    StanfordOpenClassroomIE(),
# NOTE(review): interior of the main routine (its `def` line plus many
# `try:`/`else:`/`return` lines were elided from this view).  Code lines are
# kept verbatim; only comments added.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
jar = cookielib.CookieJar()
# else-branch: a persistent MozillaCookieJar, loaded when readable.
jar = cookielib.MozillaCookieJar(opts.cookiefile)
if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to open cookie file')

if opts.dump_user_agent:
print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
if opts.batchfile == '-':
batchfd = open(opts.batchfile, 'r')
batchurls = batchfd.readlines()
batchurls = [x.strip() for x in batchurls]
# Drop blanks and comment lines (#, /, ; prefixes).
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
all_urls = map(lambda url: url.strip(), all_urls)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# (guarded by the verbose flag on an elided line)
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

# List each extractor with the URLs it would claim, then exit.
if opts.list_extractors:
for ie in extractors:
matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
all_urls = filter(lambda url: url not in matchedUrls, all_urls)
for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
parser.error(u'invalid rate limit specified')
opts.ratelimit = numeric_limit
if opts.retries is not None:
opts.retries = long(opts.retries)
except (TypeError, ValueError), err:
parser.error(u'invalid retry count specified')
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist start number specified')
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
# Must match the codec table in FFmpegExtractAudioPP.run.
if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
parser.error(u'invalid audio format specified')

# File downloader: one params dict drives all behavior.
fd = FileDownloader({
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forcethumbnail': opts.getthumbnail,
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
'simulate': opts.simulate,
'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
# First matching template wins; final fallback is plain id.ext.
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s'),
'ignoreerrors': opts.ignoreerrors,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'retries': opts.retries,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
'logtostderr': opts.outtmpl == '-',
'consoletitle': opts.consoletitle,
'nopart': opts.nopart,
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson,
'writesubtitles': opts.writesubtitles,
'subtitleslang': opts.subtitleslang,
'matchtitle': opts.matchtitle,
'rejecttitle': opts.rejecttitle,
'max_downloads': opts.max_downloads,
'prefer_free_formats': opts.prefer_free_formats,
'verbose': opts.verbose,
for extractor in extractors:
fd.add_info_extractor(extractor)

# PostProcessors
if opts.extractaudio:
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Update version
if opts.update_self:
updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
if not opts.update_self:
parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)
except MaxDownloadsReached:
fd.to_screen(u'--max-download limit reached, aborting.')

# Dump cookie jar if requested
if opts.cookiefile is not None:
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): exception ladder of the outer main() wrapper (its `def`,
# the `try:` and the call into the real main routine are elided from this
# view).  Maps known failure modes to clean exits; code lines verbatim.
except DownloadError:
except SameFileError:
sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
sys.exit(u'\nERROR: Interrupted by user')
# Script entry point; the guarded call into main() sits on lines elided
# from this view.
if __name__ == '__main__':
# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: