2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# NOTE(review): fallback mini JSON parser ("trivialjson") installed when the
# json/simplejson modules are unavailable (Python < 2.6).  These helpers are
# nested inside an enclosing loads(s)-style function whose def line and many
# statements are missing from this excerpt; visible code kept verbatim.
def raiseError(msg, i):
    # Fail with the offending position and the unparsed remainder of `s`
    # (the input string from the enclosing scope, defined outside this view).
    raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))

def skipSpace(i, expectMore=True):
    # Skip ASCII whitespace.  NOTE(review): the loop body and the
    # `expectMore` check preceding the error call are elided in this excerpt.
    while i < len(s) and s[i] in ' \t\r\n':
    raiseError('Premature end', i)

def decodeEscape(match):
    # Decode one JSON string escape.  NOTE(review): the simple-escape table
    # and the length checks before this return are elided.
    return unichr(int(esc[1:5], 16))
    if len(esc) == 5+6 and esc[5:7] == '\\u':
        # Surrogate pair \uD8xx \uDCxx -> single astral code point.
        hi = int(esc[1:5], 16)
        low = int(esc[7:11], 16)
        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
    raise ValueError('Unknown escape ' + str(esc))

# (parseString fragment) — count trailing backslashes so an escaped quote
# is not mistaken for the closing quote.
while s[e-bslashes-1] == '\\':
if bslashes % 2 == 1:
# Matches a surrogate pair, a \uXXXX escape, or any single escaped char.
rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
stri = rexp.sub(decodeEscape, s[i:e])

# (parseObj fragment) — object parsing; many control lines elided.
if s[i] == '}': # Empty dictionary
raiseError('Expected a string object key', i)
i,key = parseString(i)
if i >= len(s) or s[i] != ':':
    raiseError('Expected a colon', i)
raiseError('Expected comma or closing curly brace', i)

# (parseArray fragment) — array parsing; many control lines elided.
if s[i] == ']': # Empty array
i = skipSpace(i) # Raise exception if premature end
raiseError('Expected a comma or closing bracket', i)

def parseDiscrete(i):
    # Keyword literals.  NOTE(review): the return on a successful match
    # is elided in this excerpt.
    for k,v in {'true': True, 'false': False, 'null': None}.items():
        if s.startswith(k, i):
    raiseError('Not a boolean (or null)', i)

# (parseNumber fragment) — JSON number grammar; int vs. float chosen below.
mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
raiseError('Not a number', i)
if '.' in nums or 'e' in nums or 'E' in nums:
    return (i+len(nums), float(nums))
return (i+len(nums), int(nums))

# Dispatch on the first character of a value; numbers are the default case.
CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
i,res = CHARMAP.get(s[i], parseNumber)(i)
i = skipSpace(i, False)
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): the original wraps the lookup in a try/except and yields
    # the value from a generator loop; those lines are elided in this excerpt.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    # .next() pulls the first value from the (elided) generator body.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, e.g. "#64" or "#x40".
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the `mobj is not None` guard and the assignment of
    # `base` (10 vs. 16) are elided in this excerpt.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # "x40" -> "0x40" so long(numstr, 16) can parse it.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the surrounding try:, the check routing u'-' to stdout,
    # and the msvcrt import are elided in this excerpt; code kept verbatim.
    if sys.platform == 'win32':
        # Binary stdout on Windows so video bytes are not newline-mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout, filename)
    stream = open(_encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): the initial default assignment and the final return of
    # the timestamp are elided in this excerpt; code kept verbatim.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body (an order-preserving de-duplication over
    # `iterable`) is elided in this excerpt.
def _unescapeHTML(s):
    """Replace HTML entities in `s` with the characters they represent.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
def _encodeFilename(s):
    """Encode a unicode filename for use with filesystem APIs.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # NOTE(review): the return for this branch is elided in this excerpt.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
# Exception hierarchy used throughout the downloader.  NOTE(review): each
# class below is missing the tail of its docstring and/or trailing lines in
# this excerpt; visible code kept verbatim, docstrings closed for validity.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Both byte counts are kept so callers can report expected vs. received.
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
    # NOTE(review): decorators (e.g. @staticmethod), the def line of the
    # deflate helper, try/except scaffolding and several returns are elided
    # throughout this excerpt; visible code kept verbatim.

    # (deflate fragment) — try a raw deflate stream first, then fall back
    # to zlib-wrapped data.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib2.addinfourl (pre-2.6) takes no `code` argument.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # NOTE(review): setting the code on `ret` and returning it are
        # elided in this excerpt.

    def http_request(self, req):
        # Add the standard headers to every outgoing request.  NOTE(review):
        # the guard skipping headers already set by the caller is elided.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # The magic no-compression header is internal only: strip it (and
        # Accept-encoding) before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # NOTE(review): the line binding `old_resp` to the original response
        # is elided in this excerpt.
        # Transparently decompress gzip-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Same for deflate-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename:    Force printing final filename.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    matchtitle:       Download only matching titles.
    rejecttitle:      Reject downloads for matching titles.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    updatetime:       Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson:    Write the video description to a .info.json file
    writesubtitles:   Write the video subtitles to a .srt file
    subtitleslang:    Language of the subtitles to download
    """

    # Class-level defaults; both are reset per instance in __init__.
    _download_retcode = None   # process exit code accumulated across downloads
    _num_downloads = None      # ordinal used by the %(autonumber)s template

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): initialisation of the extractor/postprocessor lists
        # and the storing of `params` are elided in this excerpt.
        self._download_retcode = 0
        self._num_downloads = 0
        # Status output goes to stderr when the 'logtostderr' option is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# (static helpers and output methods of FileDownloader) — NOTE(review):
# @staticmethod decorators, some docstrings and several guard/return lines
# are elided throughout this excerpt; visible code kept verbatim.
    def format_bytes(bytes):
        # Human-readable byte count, e.g. 1536 -> '1.50k'.  NOTE(review):
        # the str-input and zero-bytes branches are elided here.
        if type(bytes) is str:
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Fixed-width percentage string for the progress line.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Fixed-width MM:SS estimate.  NOTE(review): the computation of
        # `dif` and the unknown/overflow branches are elided here.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Fixed-width bytes/second string; '---b/s' when unmeasurable.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size towards the observed throughput,
        # clamped to [1 byte, 4 MB].  NOTE(review): the final clamping
        # returns are elided here.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the None-match guard is elided here.
        number = float(matchobj.group(1))
        # str.index('') == 0, so a missing suffix yields multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the append to the internal extractor list is elided;
        # mutual registration happens below.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the append to the internal chain is elided here.
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator
            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            self._screen_file.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm window-title escape sequence.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # True when outtmpl contains no %(field)s placeholders at all.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
# (error-handling and file-housekeeping methods of FileDownloader) —
# NOTE(review): several guard/return lines and try: headers are elided
# throughout this excerpt; visible code kept verbatim.
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: remember the failure in the return code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        # NOTE(review): the early return and the binding of `now` are elided.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep exactly long enough to fall back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # stdout targets, --no-part, and non-regular files keep their name;
        # NOTE(review): that return line is elided here.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
        return filename + u'.part'

    def undo_temp_name(self, filename):
        # Strip the '.part' suffix added by temp_name().  NOTE(review): the
        # fallthrough return of the unchanged name is elided here.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        # Move the finished temporary file into place; same name is a no-op.
        # NOTE(review): the early return and try: header are elided here.
        if old_filename == new_filename:
        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        # NOTE(review): early returns, header parsing guards and the
        # try/except around os.utime are elided here.
        if last_modified_hdr is None:
        if not os.path.isfile(_encodeFilename(filename)):
        timestr = last_modified_hdr
        filetime = timeconvert(timestr)
        os.utime(filename, (time.time(), filetime))
# (status-reporting helpers of FileDownloader) — thin wrappers around
# to_screen() so call sites stay one-liners.  NOTE(review): a few try:
# headers and return lines are elided in this excerpt.
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writesubtitles(self, srtfn):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        # NOTE(review): the early return for 'noprogress' is elided here.
        if self.params.get('noprogress', False):
        # \r keeps the progress on a single, continuously rewritten line.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the try: header is elided here.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Terminal cannot encode the filename: report without it.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        # NOTE(review): the try: header and the returns are elided here.
        template_dict = dict(info_dict)
        # Synthesised template fields available in outtmpl.
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')

    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded """
        # NOTE(review): the final `return None` is elided in this excerpt.
        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several return lines and try: headers are elided
        # throughout this excerpt; visible code kept verbatim.
        # Title filters first: a non-None reason means "skip this video".
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)

        # Enforce --max-downloads before doing any further work.
        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # --force-* options: print the requested field to stdout.
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):

        # Ensure the destination directory exists before opening any file.
        dn = os.path.dirname(_encodeFilename(filename))
        if dn != '' and not os.path.exists(dn): # dn is already encoded
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))

        # Optionally dump the plain-text description next to the video.
        if self.params.get('writedescription', False):
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            descfile = open(_encodeFilename(descfn), 'wb')
            descfile.write(info_dict['description'].encode('utf-8'))
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
        # Optionally dump the full metadata dict as JSON.  NOTE(review):
        # try: headers, close() calls and return lines are elided in this
        # excerpt; visible code kept verbatim.
        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            except (NameError,AttributeError):
                # `json` may be absent or the trivialjson fallback (which
                # has no dump()) — presumably why these two are caught here.
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            infof = open(_encodeFilename(infofn), 'wb')
            # 'urlhandle' holds a live connection object; exclude it from
            # serialisation.
            json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
            json.dump(json_info_dict, infof)
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

        if not self.params.get('skip_download', False):
            # Respect --no-overwrites for files that already exist.
            if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
            success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

            # Run the postprocessing chain on the finished file.
            self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed (non-templated) output name cannot hold multiple files.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and the registered
        # InfoExtractors are elided in this excerpt; visible code verbatim.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it

        # Suitable InfoExtractor had been found; go to next URL

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into `info` and the loop over
        # the postprocessor chain are elided in this excerpt.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump binary."""
        # NOTE(review): try: headers, return lines and an `import pipes`
        # may be elided in this excerpt; visible code kept verbatim.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            # Quote the command line for the debug log.
            shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Re-invoke with -e to resume; -k 1 only after a code-1 exit.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    def _do_download(self, filename, info_dict):
        """Download info_dict['url'] to `filename` (HTTP, or via rtmpdump)."""
        # NOTE(review): many lines are elided throughout this excerpt —
        # try: headers, the retry-loop counter increment, the `while True:`
        # read loop, returns and several variable initialisations.  Visible
        # code kept verbatim.
        url = info_dict['url']
        player_url = info_dict.get('player_url', None)

        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request has no Range header: used to re-probe after a 416.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(_encodeFilename(tmpfilename)):
            resume_len = os.path.getsize(_encodeFilename(tmpfilename))

        if self.params.get('continuedl', False):
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            if count == 0 and 'urlhandle' in info_dict:
                # First attempt may reuse a handle the IE already opened.
                data = info_dict['urlhandle']
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
            if count <= retries:
                self.report_retry(count, retries)
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Server reports the remaining bytes; add what we already have.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len

        # Download and write
        before = time.time()
        data_block = data.read(block_size)
        if len(data_block) == 0:
        byte_counter += len(data_block)

        # Open file just in time
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        assert stream is not None
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the observed throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress reporting, relative to the resumed position.
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        if data_len is None:
            self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Throttle if --rate-limit is in effect.
        self.slow_down(start, byte_counter - resume_len)

        self.trouble(u'\nERROR: Did not get any data blocks')

        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    stitle:      Simplified title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """
    # NOTE(review): class-attribute declarations and a few statements inside
    # the methods below are elided in this excerpt; visible code verbatim.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one initialisation line is elided here.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard line is elided before this call.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a preceding statement is elided here.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
# Concrete extractor for youtube.com.  _real_initialize() forces the UI
# language to English, optionally logs in (explicit credentials or .netrc)
# and confirms age; _real_extract() pulls metadata and stream URLs from
# the get_video_info endpoint.
# NOTE(review): this listing is elided -- the embedded original line
# numbers jump (e.g. 1183 -> 1189), so "try:" headers, "if mobj is None:"
# guards, "return" statements and several dict entries are missing here.
1171 class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
# Accepts youtu.be short links and youtube(-nocookie).com watch/embed/v/e
# URLs; group 2 of the match is the video id.
1174 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; most entries fall in elided lines.
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> display dimensions map; entries entirely in elided lines.
1194 _video_dimensions = {
1209 IE_NAME = u'youtube'
1211 def report_lang(self):
1212 """Report attempt to set language."""
1213 self._downloader.to_screen(u'[youtube] Setting language')
1215 def report_login(self):
1216 """Report attempt to log in."""
1217 self._downloader.to_screen(u'[youtube] Logging in')
1219 def report_age_confirmation(self):
1220 """Report attempt to confirm age."""
1221 self._downloader.to_screen(u'[youtube] Confirming age')
1223 def report_video_webpage_download(self, video_id):
1224 """Report attempt to download video webpage."""
1225 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1227 def report_video_info_webpage_download(self, video_id):
1228 """Report attempt to download video info webpage."""
1229 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1231 def report_video_subtitles_download(self, video_id):
1232 """Report attempt to download video subtitles."""
1233 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1235 def report_information_extraction(self, video_id):
1236 """Report attempt to extract video information."""
1237 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1239 def report_unavailable_format(self, video_id, format):
1240 """Report that the requested format is not available."""
1241 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1243 def report_rtmp_download(self):
1244 """Indicate the download will use the RTMP protocol."""
1245 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Convert YouTube's closed-caption XML into SubRip (.srt) text.
# NOTE(review): the "srt" accumulator initialisation and the return
# statement fall in elided lines (1248, 1262/1263).
1247 def _closed_captions_xml_to_srt(self, xml_string):
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
# NOTE(review): cue numbers come from enumerate(), i.e. start at 0;
# the SubRip convention starts at 1 -- confirm this is intended.
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# default duration of 4 seconds when the <text> tag has no dur attribute
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
# format timestamps as hh:mm:ss,mmm, as SRT requires
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
# applied twice to resolve doubly-escaped entities (e.g. &amp;quot;)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
# Print one "itag : ext [WxH]" line per available format.
# NOTE(review): the "for x in formats:" header falls in an elided line (1266).
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Set language, then log in (if credentials were supplied) and confirm age.
1269 def _real_initialize(self):
1270 if self._downloader is None:
1275 downloader_params = self._downloader.params
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems are non-fatal: warn and continue unauthenticated
1289 except (IOError, netrc.NetrcParseError), err:
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the interface language to English so later regexes match.
1294 request = urllib2.Request(self._LANG_URL)
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1302 # No authentication to be performed
1303 if username is None:
# POST the login form; a re-served loginForm means the login failed.
1308 'current_form': 'loginForm',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1317 login_results = urllib2.urlopen(request).read()
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age so age-restricted videos can be retrieved.
1328 'action_confirm': 'Confirm',
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirections, fetch the watch page and
# get_video_info, pick formats, and hand each to process_info().
1338 def _real_extract(self, url):
1339 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1340 mobj = re.search(self._NEXT_URL_RE, url)
1342 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
1344 # Extract video id from URL
1345 mobj = re.match(self._VALID_URL, url)
1347 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1349 video_id = mobj.group(2)
1352 self.report_video_webpage_download(video_id)
1353 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1355 video_webpage = urllib2.urlopen(request).read()
1356 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1357 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1360 # Attempt to extract SWF player URL
1361 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1362 if mobj is not None:
# un-escape the JSON-escaped URL (\\/ -> /)
1363 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a token.
1368 self.report_video_info_webpage_download(video_id)
1369 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1370 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1371 % (video_id, el_type))
1372 request = urllib2.Request(video_info_url)
1374 video_info_webpage = urllib2.urlopen(request).read()
1375 video_info = parse_qs(video_info_webpage)
1376 if 'token' in video_info:
1378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1379 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1381 if 'token' not in video_info:
1382 if 'reason' in video_info:
1383 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1385 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1388 # Start extracting information
1389 self.report_information_extraction(video_id)
# uploader
1392 if 'author' not in video_info:
1393 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1395 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
1398 if 'title' not in video_info:
1399 self._downloader.trouble(u'ERROR: unable to extract video title')
1401 video_title = urllib.unquote_plus(video_info['title'][0])
1402 video_title = video_title.decode('utf-8')
1403 video_title = sanitize_title(video_title)
1406 simple_title = _simplify_title(video_title)
# thumbnail (optional)
1409 if 'thumbnail_url' not in video_info:
1410 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1411 video_thumbnail = ''
1412 else: # don't panic if we can't find it
1413 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalised to YYYYMMDD
1417 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1418 if mobj is not None:
1419 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1420 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1421 for expression in format_expressions:
1423 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: meta tag first, then the full #eow-description text via lxml
1431 video_description = u'No description available.'
1432 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1433 if mobj is not None:
1434 video_description = mobj.group(1).decode('utf-8')
1436 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1437 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1438 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1439 # TODO use another parser
# closed captions: list available languages, pick the requested one
# (falling back to 'en' or the first listed), fetch and convert to SRT
1442 video_subtitles = None
1443 if self._downloader.params.get('writesubtitles', False):
1444 self.report_video_subtitles_download(video_id)
1445 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1447 srt_list = urllib2.urlopen(request).read()
1448 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1451 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1453 if self._downloader.params.get('subtitleslang', False):
1454 srt_lang = self._downloader.params.get('subtitleslang')
1455 elif 'en' in srt_lang_list:
1458 srt_lang = srt_lang_list[0]
1459 if not srt_lang in srt_lang_list:
1460 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1462 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1464 srt_xml = urllib2.urlopen(request).read()
1465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1466 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1468 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1470 self._downloader.trouble(u'WARNING: video has no closed captions')
1473 video_token = urllib.unquote_plus(video_info['token'][0])
1475 # Decide which formats to download
1476 req_format = self._downloader.params.get('format', None)
# RTMP streams carry a single 'conn' URL instead of an itag map
1478 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1479 self.report_rtmp_download()
1480 video_url_list = [(None, video_info['conn'][0])]
1481 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map: comma-separated, urlencoded itag/url records
1482 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1483 url_data = [parse_qs(uds) for uds in url_data_strs]
1484 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1485 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1487 format_limit = self._downloader.params.get('format_limit', None)
1488 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1489 if format_limit is not None and format_limit in available_formats:
1490 format_list = available_formats[available_formats.index(format_limit):]
1492 format_list = available_formats
# keep only formats the server actually offered, best-first
1493 existing_formats = [x for x in format_list if x in url_map]
1494 if len(existing_formats) == 0:
1495 self._downloader.trouble(u'ERROR: no known formats available for video')
1497 if self._downloader.params.get('listformats', None):
1498 self._print_formats(existing_formats)
1500 if req_format is None or req_format == 'best':
1501 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1502 elif req_format == 'worst':
1503 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1504 elif req_format in ('-1', 'all'):
1505 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1507 # Specific formats. We pick the first in a slash-delimeted sequence.
1508 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1509 req_formats = req_format.split('/')
1510 video_url_list = None
1511 for rf in req_formats:
1513 video_url_list = [(rf, url_map[rf])]
1515 if video_url_list is None:
1516 self._downloader.trouble(u'ERROR: requested format not available')
1519 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand one info dict per selected format to the downloader.
1522 for format_param, video_real_url in video_url_list:
1523 # At this point we have a new video
1524 self._downloader.increment_downloads()
1527 video_extension = self._video_extensions.get(format_param, 'flv')
1530 # Process video information
1531 self._downloader.process_info({
1532 'id': video_id.decode('utf-8'),
1533 'url': video_real_url.decode('utf-8'),
1534 'uploader': video_uploader.decode('utf-8'),
1535 'upload_date': upload_date,
1536 'title': video_title,
1537 'stitle': simple_title,
1538 'ext': video_extension.decode('utf-8'),
# format_param is None only for RTMP streams (no itag available)
1539 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1540 'thumbnail': video_thumbnail.decode('utf-8'),
1541 'description': video_description,
1542 'player_url': player_url,
1543 'subtitles': video_subtitles
1545 except UnavailableVideoError, err:
1546 self._downloader.trouble(u'\nERROR: unable to download video')
# Metacafe extractor.  _real_initialize() fetches the disclaimer page and
# POSTs the family-filter form; _real_extract() delegates "yt-" prefixed
# ids to the wrapped YoutubeIE, otherwise scrapes the watch page.
# NOTE(review): this listing is elided -- "try:" headers, "if mobj is
# None:" guards and "return" statements fall in missing lines.
1549 class MetacafeIE(InfoExtractor):
1550 """Information Extractor for metacafe.com."""
# group 1: video id, group 2: URL title slug
1552 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1553 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1554 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1556 IE_NAME = u'metacafe'
# Keeps a YoutubeIE instance to delegate Metacafe-hosted YouTube videos.
1558 def __init__(self, youtube_ie, downloader=None):
1559 InfoExtractor.__init__(self, downloader)
1560 self._youtube_ie = youtube_ie
1562 def report_disclaimer(self):
1563 """Report disclaimer retrieval."""
1564 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1566 def report_age_confirmation(self):
1567 """Report attempt to confirm age."""
1568 self._downloader.to_screen(u'[metacafe] Confirming age')
1570 def report_download_webpage(self, video_id):
1571 """Report webpage download."""
1572 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1574 def report_extraction(self, video_id):
1575 """Report information extraction."""
1576 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1578 def _real_initialize(self):
1579 # Retrieve disclaimer
1580 request = urllib2.Request(self._DISCLAIMER)
1582 self.report_disclaimer()
1583 disclaimer = urllib2.urlopen(request).read()
1584 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1585 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age by POSTing the family-filter form.
1591 'submit': "Continue - I'm over 18",
1593 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1595 self.report_age_confirmation()
1596 disclaimer = urllib2.urlopen(request).read()
1597 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1598 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1601 def _real_extract(self, url):
1602 # Extract id and simplified title from URL
1603 mobj = re.match(self._VALID_URL, url)
1605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1608 video_id = mobj.group(1)
1610 # Check if video comes from YouTube
1611 mobj2 = re.match(r'^yt-(.*)$', video_id)
1612 if mobj2 is not None:
# delegate YouTube-hosted videos to the wrapped YoutubeIE
1613 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1616 # At this point we have a new video
1617 self._downloader.increment_downloads()
1619 simple_title = mobj.group(2).decode('utf-8')
1621 # Retrieve video webpage to extract further information
1622 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1624 self.report_download_webpage(video_id)
1625 webpage = urllib2.urlopen(request).read()
1626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1627 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1630 # Extract URL, uploader and title from webpage
1631 self.report_extraction(video_id)
# Path 1: plain &mediaURL= parameter, optionally signed with gdaKey.
1632 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1633 if mobj is not None:
1634 mediaURL = urllib.unquote(mobj.group(1))
1635 video_extension = mediaURL[-3:]
1637 # Extract gdaKey if available
1638 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1640 video_url = mediaURL
1642 gdaKey = mobj.group(1)
1643 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Path 2: dig mediaURL and key out of the flashvars mediaData JSON.
1645 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1647 self._downloader.trouble(u'ERROR: unable to extract media URL')
1649 vardict = parse_qs(mobj.group(1))
1650 if 'mediaData' not in vardict:
1651 self._downloader.trouble(u'ERROR: unable to extract media URL')
1653 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1655 self._downloader.trouble(u'ERROR: unable to extract media URL')
# un-escape JSON forward slashes
1657 mediaURL = mobj.group(1).replace('\\/', '/')
1658 video_extension = mediaURL[-3:]
1659 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1661 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1665 video_title = mobj.group(1).decode('utf-8')
1666 video_title = sanitize_title(video_title)
1668 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1672 video_uploader = mobj.group(1)
1675 # Process video information
1676 self._downloader.process_info({
1677 'id': video_id.decode('utf-8'),
1678 'url': video_url.decode('utf-8'),
1679 'uploader': video_uploader.decode('utf-8'),
1680 'upload_date': u'NA',
1681 'title': video_title,
1682 'stitle': simple_title,
1683 'ext': video_extension.decode('utf-8'),
1687 except UnavailableVideoError:
1688 self._downloader.trouble(u'\nERROR: unable to download video')
# Dailymotion extractor: disables the family filter via cookie, then
# pulls the SD stream URL out of the "sequence" flashvar.
# NOTE(review): this listing is elided -- "try:" headers, "if mobj is
# None:" guards and "return" statements fall in missing lines.
1691 class DailymotionIE(InfoExtractor):
1692 """Information Extractor for Dailymotion"""
# group 1: video id (before the first underscore), group 2: title slug
1694 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1695 IE_NAME = u'dailymotion'
1697 def __init__(self, downloader=None):
1698 InfoExtractor.__init__(self, downloader)
1700 def report_download_webpage(self, video_id):
1701 """Report webpage download."""
1702 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1704 def report_extraction(self, video_id):
1705 """Report information extraction."""
1706 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1708 def _real_extract(self, url):
1709 # Extract id and simplified title from URL
1710 mobj = re.match(self._VALID_URL, url)
1712 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1715 # At this point we have a new video
1716 self._downloader.increment_downloads()
1717 video_id = mobj.group(1)
1719 video_extension = 'flv'
1721 # Retrieve video webpage to extract further information
1722 request = urllib2.Request(url)
# bypass the family filter so restricted videos are served
1723 request.add_header('Cookie', 'family_filter=off')
1725 self.report_download_webpage(video_id)
1726 webpage = urllib2.urlopen(request).read()
1727 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1731 # Extract URL, uploader and title from webpage
1732 self.report_extraction(video_id)
# the player's "sequence" flashvar embeds the stream URLs
1733 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1735 self._downloader.trouble(u'ERROR: unable to extract media URL')
1737 sequence = urllib.unquote(mobj.group(1))
1738 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1740 self._downloader.trouble(u'ERROR: unable to extract media URL')
1742 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1744 # if needed add http://www.dailymotion.com/ if relative URL
1746 video_url = mediaURL
1748 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1750 self._downloader.trouble(u'ERROR: unable to extract title')
1752 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1753 video_title = sanitize_title(video_title)
1754 simple_title = _simplify_title(video_title)
1756 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1758 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1760 video_uploader = mobj.group(1)
1763 # Process video information
1764 self._downloader.process_info({
1765 'id': video_id.decode('utf-8'),
1766 'url': video_url.decode('utf-8'),
1767 'uploader': video_uploader.decode('utf-8'),
1768 'upload_date': u'NA',
1769 'title': video_title,
1770 'stitle': simple_title,
1771 'ext': video_extension.decode('utf-8'),
1775 except UnavailableVideoError:
1776 self._downloader.trouble(u'\nERROR: unable to download video')
# Google Video extractor: prefers the mp4 download_url embedded in the
# page, falling back to the hex-escaped flv videoUrl.
# NOTE(review): this listing is elided -- "try:" headers, "if mobj is
# None:" guards and "return" statements fall in missing lines.
1779 class GoogleIE(InfoExtractor):
1780 """Information extractor for video.google.com."""
# group 1: the docid query parameter
1782 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1783 IE_NAME = u'video.google'
1785 def __init__(self, downloader=None):
1786 InfoExtractor.__init__(self, downloader)
1788 def report_download_webpage(self, video_id):
1789 """Report webpage download."""
1790 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1792 def report_extraction(self, video_id):
1793 """Report information extraction."""
1794 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1796 def _real_extract(self, url):
1797 # Extract id from URL
1798 mobj = re.match(self._VALID_URL, url)
1800 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1803 # At this point we have a new video
1804 self._downloader.increment_downloads()
1805 video_id = mobj.group(1)
1807 video_extension = 'mp4'
1809 # Retrieve video webpage to extract further information
1810 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1812 self.report_download_webpage(video_id)
1813 webpage = urllib2.urlopen(request).read()
1814 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1815 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1818 # Extract URL, uploader, and title from webpage
1819 self.report_extraction(video_id)
# preferred: direct mp4 download_url; fallback: flv videoUrl below
1820 mobj = re.search(r"download_url:'([^']+)'", webpage)
1822 video_extension = 'flv'
1823 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1825 self._downloader.trouble(u'ERROR: unable to extract media URL')
1827 mediaURL = urllib.unquote(mobj.group(1))
# undo the JavaScript hex escaping (\x3d -> '=', \x26 -> '&')
1828 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1829 mediaURL = mediaURL.replace('\\x26', '\x26')
1831 video_url = mediaURL
1833 mobj = re.search(r'<title>(.*)</title>', webpage)
1835 self._downloader.trouble(u'ERROR: unable to extract title')
1837 video_title = mobj.group(1).decode('utf-8')
1838 video_title = sanitize_title(video_title)
1839 simple_title = _simplify_title(video_title)
1841 # Extract video description
1842 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1844 self._downloader.trouble(u'ERROR: unable to extract video description')
1846 video_description = mobj.group(1).decode('utf-8')
1848 video_description = 'No description available.'
1847 if not video_description:
1850 # Extract video thumbnail
# the thumbnail needs a second request, so it is only fetched on demand
1851 if self._downloader.params.get('forcethumbnail', False):
1852 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1854 webpage = urllib2.urlopen(request).read()
1855 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1856 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1858 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1860 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1862 video_thumbnail = mobj.group(1)
1863 else: # we need something to pass to process_info
1864 video_thumbnail = ''
1867 # Process video information
1868 self._downloader.process_info({
1869 'id': video_id.decode('utf-8'),
1870 'url': video_url.decode('utf-8'),
1872 'upload_date': u'NA',
1873 'title': video_title,
1874 'stitle': simple_title,
1875 'ext': video_extension.decode('utf-8'),
1879 except UnavailableVideoError:
1880 self._downloader.trouble(u'\nERROR: unable to download video')
1883 class PhotobucketIE(InfoExtractor):
1884 """Information extractor for photobucket.com."""
1886 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1887 IE_NAME = u'photobucket'
1889 def __init__(self, downloader=None):
1890 InfoExtractor.__init__(self, downloader)
1892 def report_download_webpage(self, video_id):
1893 """Report webpage download."""
1894 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1896 def report_extraction(self, video_id):
1897 """Report information extraction."""
1898 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1900 def _real_extract(self, url):
1901 # Extract id from URL
1902 mobj = re.match(self._VALID_URL, url)
1904 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1907 # At this point we have a new video
1908 self._downloader.increment_downloads()
1909 video_id = mobj.group(1)
1911 video_extension = 'flv'
1913 # Retrieve video webpage to extract further information
1914 request = urllib2.Request(url)
1916 self.report_download_webpage(video_id)
1917 webpage = urllib2.urlopen(request).read()
1918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1919 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1922 # Extract URL, uploader, and title from webpage
1923 self.report_extraction(video_id)
1924 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1926 self._downloader.trouble(u'ERROR: unable to extract media URL')
1928 mediaURL = urllib.unquote(mobj.group(1))
1930 video_url = mediaURL
1932 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1934 self._downloader.trouble(u'ERROR: unable to extract title')
1936 video_title = mobj.group(1).decode('utf-8')
1937 video_title = sanitize_title(video_title)
1938 simple_title = _simplify_title(vide_title)
1940 video_uploader = mobj.group(2).decode('utf-8')
1943 # Process video information
1944 self._downloader.process_info({
1945 'id': video_id.decode('utf-8'),
1946 'url': video_url.decode('utf-8'),
1947 'uploader': video_uploader,
1948 'upload_date': u'NA',
1949 'title': video_title,
1950 'stitle': simple_title,
1951 'ext': video_extension.decode('utf-8'),
1955 except UnavailableVideoError:
1956 self._downloader.trouble(u'\nERROR: unable to download video')
# Yahoo! Video extractor (header and reporting helpers; _real_extract
# continues beyond this span).  Non-/watch/ URLs are first rewritten to
# the extractable English /watch/ form, per the comments below.
1959 class YahooIE(InfoExtractor):
1960 """Information extractor for video.yahoo.com."""
1962 # _VALID_URL matches all Yahoo! Video URLs
1963 # _VPAGE_URL matches only the extractable '/watch/' URLs
# groups: (1) page id, (2) video id
1964 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1965 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1966 IE_NAME = u'video.yahoo'
1968 def __init__(self, downloader=None):
1969 InfoExtractor.__init__(self, downloader)
1971 def report_download_webpage(self, video_id):
1972 """Report webpage download."""
1973 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1975 def report_extraction(self, video_id):
1976 """Report information extraction."""
1977 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1979 def _real_extract(self, url, new_video=True):
1980 # Extract ID from URL
1981 mobj = re.match(self._VALID_URL, url)
1983 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1986 # At this point we have a new video
1987 self._downloader.increment_downloads()
1988 video_id = mobj.group(2)
1989 video_extension = 'flv'
1991 # Rewrite valid but non-extractable URLs as
1992 # extractable English language /watch/ URLs
1993 if re.match(self._VPAGE_URL, url) is None:
1994 request = urllib2.Request(url)
1996 webpage = urllib2.urlopen(request).read()
1997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1998 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2001 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2003 self._downloader.trouble(u'ERROR: Unable to extract id field')
2005 yahoo_id = mobj.group(1)
2007 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2009 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2011 yahoo_vid = mobj.group(1)
2013 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2014 return self._real_extract(url, new_video=False)
2016 # Retrieve video webpage to extract further information
2017 request = urllib2.Request(url)
2019 self.report_download_webpage(video_id)
2020 webpage = urllib2.urlopen(request).read()
2021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2022 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2025 # Extract uploader and title from webpage
2026 self.report_extraction(video_id)
2027 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract video title')
2031 video_title = mobj.group(1).decode('utf-8')
2032 simple_title = _simplify_title(video_title)
2034 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2036 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2038 video_uploader = mobj.group(1).decode('utf-8')
2040 # Extract video thumbnail
2041 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2043 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2045 video_thumbnail = mobj.group(1).decode('utf-8')
2047 # Extract video description
2048 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2050 self._downloader.trouble(u'ERROR: unable to extract video description')
2052 video_description = mobj.group(1).decode('utf-8')
2053 if not video_description:
2054 video_description = 'No description available.'
2056 # Extract video height and width
2057 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2059 self._downloader.trouble(u'ERROR: unable to extract video height')
2061 yv_video_height = mobj.group(1)
2063 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2065 self._downloader.trouble(u'ERROR: unable to extract video width')
2067 yv_video_width = mobj.group(1)
2069 # Retrieve video playlist to extract media URL
2070 # I'm not completely sure what all these options are, but we
2071 # seem to need most of them, otherwise the server sends a 401.
2072 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2073 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2074 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2075 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2076 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2078 self.report_download_webpage(video_id)
2079 webpage = urllib2.urlopen(request).read()
2080 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2081 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2084 # Extract media URL from playlist XML
2085 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2087 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2089 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2090 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2093 # Process video information
2094 self._downloader.process_info({
2095 'id': video_id.decode('utf-8'),
2097 'uploader': video_uploader,
2098 'upload_date': u'NA',
2099 'title': video_title,
2100 'stitle': simple_title,
2101 'ext': video_extension.decode('utf-8'),
2102 'thumbnail': video_thumbnail.decode('utf-8'),
2103 'description': video_description,
2104 'thumbnail': video_thumbnail,
2107 except UnavailableVideoError:
2108 self._downloader.trouble(u'\nERROR: unable to download video')
2111 class VimeoIE(InfoExtractor):
2112 """Information extractor for vimeo.com."""
2114 # _VALID_URL matches Vimeo URLs
# NOTE(review): the '.' after (?:www|player) is unescaped, so it matches any
# character, not just a literal dot — presumably meant r'(?:www|player)\.'.
2115 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2118 def __init__(self, downloader=None):
2119 InfoExtractor.__init__(self, downloader)
2121 def report_download_webpage(self, video_id):
2122 """Report webpage download."""
2123 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2125 def report_extraction(self, video_id):
2126 """Report information extraction."""
2127 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Fetch the video page, pull the embedded config JSON out of it, pick a
# codec/quality, then hand the assembled play_redirect URL to the downloader.
2129 def _real_extract(self, url, new_video=True):
2130 # Extract ID from URL
2131 mobj = re.match(self._VALID_URL, url)
2133 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2136 # At this point we have a new video
2137 self._downloader.increment_downloads()
2138 video_id = mobj.group(1)
2140 # Retrieve video webpage to extract further information
2141 request = urllib2.Request(url, None, std_headers)
2143 self.report_download_webpage(video_id)
2144 webpage = urllib2.urlopen(request).read()
2145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2146 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2149 # Now we begin extracting as much information as we can from what we
2150 # retrieved. First we extract the information common to all extractors,
2151 # and latter we extract those that are Vimeo specific.
2152 self.report_extraction(video_id)
2154 # Extract the config JSON
# Brittle by design: the config blob is located by string-splitting the raw
# HTML between ' = {config:' and ',assets:' rather than by parsing the page.
2155 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2157 config = json.loads(config)
2159 self._downloader.trouble(u'ERROR: unable to extract info section')
2163 video_title = config["video"]["title"]
2164 simple_title = _simplify_title(video_title)
2167 video_uploader = config["video"]["owner"]["name"]
2169 # Extract video thumbnail
2170 video_thumbnail = config["video"]["thumbnail"]
2172 # Extract video description
2176 video_description = u'No description available.'
2177 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2178 if mobj is not None:
2179 video_description = mobj.group(1)
# Fallback description extraction via lxml; requires lxml to be importable
# (presumably guarded by surrounding, elided try/except — TODO confirm).
2181 html_parser = lxml.etree.HTMLParser()
2182 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2183 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2184 # TODO use another parser
2186 # Extract upload date
2187 video_upload_date = u'NA'
2188 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2189 if mobj is not None:
2190 video_upload_date = mobj.group(1)
2192 # Vimeo specific: extract request signature and timestamp
2193 sig = config['request']['signature']
2194 timestamp = config['request']['timestamp']
2196 # Vimeo specific: extract video codec and quality information
2197 # TODO bind to format param
# Preference order: h264/mp4, then vp8/flv, then vp6/flv; 'hd' wins over 'sd'
# when the chosen codec offers it.
2198 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2199 for codec in codecs:
2200 if codec[0] in config["video"]["files"]:
2201 video_codec = codec[0]
2202 video_extension = codec[1]
2203 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2204 else: quality = 'sd'
# NOTE(review): no 'break' is visible after a codec match, so a later (worse)
# codec could overwrite an earlier match — confirm against the full source.
2207 self._downloader.trouble(u'ERROR: no known codec found')
# The final media URL is a play_redirect endpoint keyed by clip id, request
# signature, timestamp, quality and upper-cased codec name.
2210 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2211 %(video_id, sig, timestamp, quality, video_codec.upper())
2214 # Process video information
2215 self._downloader.process_info({
2218 'uploader': video_uploader,
2219 'upload_date': video_upload_date,
2220 'title': video_title,
2221 'stitle': simple_title,
2222 'ext': video_extension,
2223 'thumbnail': video_thumbnail,
2224 'description': video_description,
2227 except UnavailableVideoError:
2228 self._downloader.trouble(u'ERROR: unable to download video')
2231 class GenericIE(InfoExtractor):
2232 """Generic last-resort information extractor."""
2235 IE_NAME = u'generic'
2237 def __init__(self, downloader=None):
2238 InfoExtractor.__init__(self, downloader)
2240 def report_download_webpage(self, video_id):
2241 """Report webpage download."""
# Always warn first: reaching this extractor means no site-specific IE matched.
2242 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2243 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2245 def report_extraction(self, video_id):
2246 """Report information extraction."""
2247 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# Best-effort extraction: scrape a direct media URL out of arbitrary HTML
# (JW Player flashvars first, then any file=/source= parameter).
2249 def _real_extract(self, url):
2250 # At this point we have a new video
2251 self._downloader.increment_downloads()
# Provisional id from the URL tail; replaced below once the real media URL
# is known.
2253 video_id = url.split('/')[-1]
2254 request = urllib2.Request(url)
2256 self.report_download_webpage(video_id)
2257 webpage = urllib2.urlopen(request).read()
2258 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2259 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2261 except ValueError, err:
2262 # since this is the last-resort InfoExtractor, if
2263 # this error is thrown, it'll be thrown here
2264 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2267 self.report_extraction(video_id)
2268 # Start with something easy: JW Player in SWFObject
2269 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2271 # Broaden the search a little bit
2272 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2274 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2277 # It's possible that one of the regexes
2278 # matched, but returned an empty group:
2279 if mobj.group(1) is None:
2280 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2283 video_url = urllib.unquote(mobj.group(1))
2284 video_id = os.path.basename(video_url)
2286 # here's a fun little line of code for you:
# splitext twice: first take the extension (without the dot), then strip it
# from the id.
2287 video_extension = os.path.splitext(video_id)[1][1:]
2288 video_id = os.path.splitext(video_id)[0]
2290 # it's tempting to parse this further, but you would
2291 # have to take into account all the variations like
2292 # Video Title - Site Name
2293 # Site Name | Video Title
2294 # Video Title - Tagline | Site Name
2295 # and so on and so forth; it's just not practical
2296 mobj = re.search(r'<title>(.*)</title>', webpage)
2298 self._downloader.trouble(u'ERROR: unable to extract title')
2300 video_title = mobj.group(1).decode('utf-8')
2301 video_title = sanitize_title(video_title)
2302 simple_title = _simplify_title(video_title)
2304 # video uploader is domain name
2305 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but the failing step extracts
# the uploader (host name) — looks like a copy/paste slip in the message.
2307 self._downloader.trouble(u'ERROR: unable to extract title')
2309 video_uploader = mobj.group(1).decode('utf-8')
2312 # Process video information
2313 self._downloader.process_info({
2314 'id': video_id.decode('utf-8'),
2315 'url': video_url.decode('utf-8'),
2316 'uploader': video_uploader,
2317 'upload_date': u'NA',
2318 'title': video_title,
2319 'stitle': simple_title,
2320 'ext': video_extension.decode('utf-8'),
2324 except UnavailableVideoError, err:
2325 self._downloader.trouble(u'\nERROR: unable to download video')
2328 class YoutubeSearchIE(InfoExtractor):
2329 """Information Extractor for YouTube search queries."""
# 'ytsearch:Q' (first result), 'ytsearchN:Q' (N results), 'ytsearchall:Q'.
2330 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData API v2, JSON-C output, fixed page size of 50 results per request.
2331 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2333 _max_youtube_results = 1000
2334 IE_NAME = u'youtube:search'
# Delegates actual video extraction to a wrapped YoutubeIE instance.
2336 def __init__(self, youtube_ie, downloader=None):
2337 InfoExtractor.__init__(self, downloader)
2338 self._youtube_ie = youtube_ie
2340 def report_download_page(self, query, pagenum):
2341 """Report attempt to download playlist page with given number."""
2342 query = query.decode(preferredencoding())
2343 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2345 def _real_initialize(self):
2346 self._youtube_ie.initialize()
# Parse the ytsearch prefix into a result count, then fan out downloads.
2348 def _real_extract(self, query):
2349 mobj = re.match(self._VALID_URL, query)
2351 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') raises ValueError if the query itself contains a
# colon (more than two fields) — confirm whether elided code handles that.
2354 prefix, query = query.split(':')
2356 query = query.encode('utf-8')
2358 self._download_n_results(query, 1)
2360 elif prefix == 'all':
2361 self._download_n_results(query, self._max_youtube_results)
2367 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2369 elif n > self._max_youtube_results:
2370 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2371 n = self._max_youtube_results
2372 self._download_n_results(query, n)
2374 except ValueError: # parsing prefix as integer fails
2375 self._download_n_results(query, 1)
2378 def _download_n_results(self, query, n):
2379 """Downloads a specified number of results for a query"""
# Page through the API 50 ids at a time until either n results are collected
# or the server-reported totalItems is exhausted.
2385 while (50 * pagenum) < limit:
2386 self.report_download_page(query, pagenum+1)
2387 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2388 request = urllib2.Request(result_url)
2390 data = urllib2.urlopen(request).read()
2391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2394 api_response = json.loads(data)['data']
2396 new_ids = list(video['id'] for video in api_response['items'])
2397 video_ids += new_ids
# Tighten the loop bound with the server's own total so we stop early.
2399 limit = min(n, api_response['totalItems'])
2402 if len(video_ids) > n:
2403 video_ids = video_ids[:n]
# ('id' shadows the builtin here — pre-existing style in this file.)
2404 for id in video_ids:
2405 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2409 class GoogleSearchIE(InfoExtractor):
2410 """Information Extractor for Google Video search queries."""
# 'gvsearch:Q', 'gvsearchN:Q', 'gvsearchall:Q' — same scheme as ytsearch.
2411 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2412 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2413 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" link in the HTML drives pagination.
2414 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2416 _max_google_results = 1000
2417 IE_NAME = u'video.google:search'
# Delegates per-video extraction to a wrapped GoogleIE instance.
2419 def __init__(self, google_ie, downloader=None):
2420 InfoExtractor.__init__(self, downloader)
2421 self._google_ie = google_ie
2423 def report_download_page(self, query, pagenum):
2424 """Report attempt to download playlist page with given number."""
2425 query = query.decode(preferredencoding())
2426 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2428 def _real_initialize(self):
2429 self._google_ie.initialize()
# Parse the gvsearch prefix into a result count, then fan out downloads.
2431 def _real_extract(self, query):
2432 mobj = re.match(self._VALID_URL, query)
2434 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2437 prefix, query = query.split(':')
2439 query = query.encode('utf-8')
2441 self._download_n_results(query, 1)
2443 elif prefix == 'all':
2444 self._download_n_results(query, self._max_google_results)
2450 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2452 elif n > self._max_google_results:
2453 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2454 n = self._max_google_results
2455 self._download_n_results(query, n)
2457 except ValueError: # parsing prefix as integer fails
2458 self._download_n_results(query, 1)
2461 def _download_n_results(self, query, n):
2462 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results per page via start=pagenum*10) until n ids
# are gathered or no "next" link remains.
2468 self.report_download_page(query, pagenum)
2469 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2470 request = urllib2.Request(result_url)
2472 page = urllib2.urlopen(request).read()
2473 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2477 # Extract video identifiers
2478 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2479 video_id = mobj.group(1)
2480 if video_id not in video_ids:
2481 video_ids.append(video_id)
2482 if len(video_ids) == n:
2483 # Specified n videos reached
# Two exit paths (quota reached / no more pages) each flush the collected ids
# to the wrapped extractor before returning.
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2489 for id in video_ids:
2490 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2493 pagenum = pagenum + 1
2496 class YahooSearchIE(InfoExtractor):
2497 """Information Extractor for Yahoo! Video search queries."""
# 'yvsearch:Q', 'yvsearchN:Q', 'yvsearchall:Q' — same scheme as ytsearch.
2498 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2499 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2500 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2501 _MORE_PAGES_INDICATOR = r'\s*Next'
2503 _max_yahoo_results = 1000
2504 IE_NAME = u'video.yahoo:search'
# Delegates per-video extraction to a wrapped YahooIE instance.
2506 def __init__(self, yahoo_ie, downloader=None):
2507 InfoExtractor.__init__(self, downloader)
2508 self._yahoo_ie = yahoo_ie
2510 def report_download_page(self, query, pagenum):
2511 """Report attempt to download playlist page with given number."""
2512 query = query.decode(preferredencoding())
2513 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2515 def _real_initialize(self):
2516 self._yahoo_ie.initialize()
# Parse the yvsearch prefix into a result count, then fan out downloads.
2518 def _real_extract(self, query):
2519 mobj = re.match(self._VALID_URL, query)
2521 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2524 prefix, query = query.split(':')
2526 query = query.encode('utf-8')
2528 self._download_n_results(query, 1)
2530 elif prefix == 'all':
2531 self._download_n_results(query, self._max_yahoo_results)
2537 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2539 elif n > self._max_yahoo_results:
2540 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2541 n = self._max_yahoo_results
2542 self._download_n_results(query, n)
2544 except ValueError: # parsing prefix as integer fails
2545 self._download_n_results(query, 1)
2548 def _download_n_results(self, query, n):
2549 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup here uses an auxiliary set for O(1) membership.
2552 already_seen = set()
2556 self.report_download_page(query, pagenum)
2557 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2558 request = urllib2.Request(result_url)
2560 page = urllib2.urlopen(request).read()
2561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2565 # Extract video identifiers
2566 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2567 video_id = mobj.group(1)
2568 if video_id not in already_seen:
2569 video_ids.append(video_id)
2570 already_seen.add(video_id)
2571 if len(video_ids) == n:
2572 # Specified n videos reached
# Two exit paths (quota reached / no "Next" link) each flush collected ids.
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2577 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2578 for id in video_ids:
2579 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2582 pagenum = pagenum + 1
2585 class YoutubePlaylistIE(InfoExtractor):
2586 """Information Extractor for YouTube playlists."""
# group(1): playlist-type discriminator (p/a/list), group(2): playlist id,
# group(3): optional direct video id embedded in the URL.
2588 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2589 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2590 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2591 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2593 IE_NAME = u'youtube:playlist'
# Delegates per-video extraction to a wrapped YoutubeIE instance.
2595 def __init__(self, youtube_ie, downloader=None):
2596 InfoExtractor.__init__(self, downloader)
2597 self._youtube_ie = youtube_ie
2599 def report_download_page(self, playlist_id, pagenum):
2600 """Report attempt to download playlist page with given number."""
2601 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2603 def _real_initialize(self):
2604 self._youtube_ie.initialize()
# Collect all video ids from the playlist's HTML pages, apply the user's
# playliststart/playlistend window, then extract each video.
2606 def _real_extract(self, url):
2607 # Extract playlist id
2608 mobj = re.match(self._VALID_URL, url)
2610 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video URL inside a playlist: hand it straight to YoutubeIE.
2614 if mobj.group(3) is not None:
2615 self._youtube_ie.extract(mobj.group(3))
2618 # Download playlist pages
2619 # prefix is 'p' as default for playlists but there are other types that need extra care
2620 playlist_prefix = mobj.group(1)
2621 if playlist_prefix == 'a':
2622 playlist_access = 'artist'
2624 playlist_prefix = 'p'
2625 playlist_access = 'view_play_list'
2626 playlist_id = mobj.group(2)
2631 self.report_download_page(playlist_id, pagenum)
2632 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2633 request = urllib2.Request(url)
2635 page = urllib2.urlopen(request).read()
2636 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2637 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2640 # Extract video identifiers
2642 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2643 if mobj.group(1) not in ids_in_page:
2644 ids_in_page.append(mobj.group(1))
2645 video_ids.extend(ids_in_page)
# Stop when the page has no "Next" link.
2647 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2649 pagenum = pagenum + 1
# playliststart is 1-based for the user; -1 playlistend means "to the end".
2651 playliststart = self._downloader.params.get('playliststart', 1) - 1
2652 playlistend = self._downloader.params.get('playlistend', -1)
2653 if playlistend == -1:
2654 video_ids = video_ids[playliststart:]
2656 video_ids = video_ids[playliststart:playlistend]
2658 for id in video_ids:
2659 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2663 class YoutubeUserIE(InfoExtractor):
2664 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs or the 'ytuser:NAME' shorthand.
2666 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2667 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2668 _GDATA_PAGE_SIZE = 50
2669 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2670 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2672 IE_NAME = u'youtube:user'
# Delegates per-video extraction to a wrapped YoutubeIE instance.
2674 def __init__(self, youtube_ie, downloader=None):
2675 InfoExtractor.__init__(self, downloader)
2676 self._youtube_ie = youtube_ie
2678 def report_download_page(self, username, start_index):
2679 """Report attempt to download user page."""
2680 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2681 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2683 def _real_initialize(self):
2684 self._youtube_ie.initialize()
# Page through the user's uploads feed via the GData API, collect video ids,
# apply the playliststart/playlistend window, then extract each video.
2686 def _real_extract(self, url):
2688 mobj = re.match(self._VALID_URL, url)
2690 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2693 username = mobj.group(1)
2695 # Download video ids using YouTube Data API. Result size per
2696 # query is limited (currently to 50 videos) so we need to query
2697 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2704 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2705 self.report_download_page(username, start_index)
2707 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2710 page = urllib2.urlopen(request).read()
2711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2715 # Extract video identifiers
2718 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2719 if mobj.group(1) not in ids_in_page:
2720 ids_in_page.append(mobj.group(1))
2722 video_ids.extend(ids_in_page)
2724 # A little optimization - if current page is not
2725 # "full", ie. does not contain PAGE_SIZE video ids then
2726 # we can assume that this page is the last one - there
2727 # are no more ids on further pages - no need to query
2730 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2735 all_ids_count = len(video_ids)
# Same windowing semantics as YoutubePlaylistIE: 1-based start, -1 end means
# "to the end of the list".
2736 playliststart = self._downloader.params.get('playliststart', 1) - 1
2737 playlistend = self._downloader.params.get('playlistend', -1)
2739 if playlistend == -1:
2740 video_ids = video_ids[playliststart:]
2742 video_ids = video_ids[playliststart:playlistend]
2744 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2745 (username, all_ids_count, len(video_ids)))
2747 for video_id in video_ids:
2748 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2751 class DepositFilesIE(InfoExtractor):
2752 """Information extractor for depositfiles.com"""
# The (?#locale) is a regex comment; '../' skips a two-letter locale segment.
2754 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2755 IE_NAME = u'DepositFiles'
2757 def __init__(self, downloader=None):
2758 InfoExtractor.__init__(self, downloader)
2760 def report_download_webpage(self, file_id):
2761 """Report webpage download."""
2762 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2764 def report_extraction(self, file_id):
2765 """Report information extraction."""
2766 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# Simulate pressing the 'Free download' button and scrape the real file URL
# (or the site's restriction message) from the response.
2768 def _real_extract(self, url):
2769 # At this point we have a new file
2770 self._downloader.increment_downloads()
2772 file_id = url.split('/')[-1]
2773 # Rebuild url in english locale
2774 url = 'http://depositfiles.com/en/files/' + file_id
2776 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 is what the button submits.
2777 free_download_indication = { 'gateway_result' : '1' }
2778 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2780 self.report_download_webpage(file_id)
2781 webpage = urllib2.urlopen(request).read()
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2786 # Search for the real file URL
2787 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2788 if (mobj is None) or (mobj.group(1) is None):
2789 # Try to figure out reason of the error.
2790 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2791 if (mobj is not None) and (mobj.group(1) is not None):
# Style note: '\s+' is a non-raw string; it works ('\s' is not a Python string
# escape) but should be a raw string r'\s+' by convention.
2792 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2793 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2795 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2798 file_url = mobj.group(1)
2799 file_extension = os.path.splitext(file_url)[1][1:]
2801 # Search for file title
2802 mobj = re.search(r'<b title="(.*?)">', webpage)
2804 self._downloader.trouble(u'ERROR: unable to extract title')
2806 file_title = mobj.group(1).decode('utf-8')
2809 # Process file information
2810 self._downloader.process_info({
2811 'id': file_id.decode('utf-8'),
2812 'url': file_url.decode('utf-8'),
2814 'upload_date': u'NA',
2815 'title': file_title,
2816 'stitle': file_title,
2817 'ext': file_extension.decode('utf-8'),
2821 except UnavailableVideoError, err:
2822 self._downloader.trouble(u'ERROR: unable to download file')
2825 class FacebookIE(InfoExtractor):
2826 """Information Extractor for Facebook"""
2828 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
# Login goes through the mobile site; credentials come from --username/--password
# or .netrc (machine name below).
2829 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2830 _NETRC_MACHINE = 'facebook'
# Format preference order, best first.
2831 _available_formats = ['video', 'highqual', 'lowqual']
2832 _video_extensions = {
2837 IE_NAME = u'facebook'
2839 def __init__(self, downloader=None):
2840 InfoExtractor.__init__(self, downloader)
2842 def _reporter(self, message):
2843 """Add header and report message."""
2844 self._downloader.to_screen(u'[facebook] %s' % message)
2846 def report_login(self):
2847 """Report attempt to log in."""
2848 self._reporter(u'Logging in')
2850 def report_video_webpage_download(self, video_id):
2851 """Report attempt to download video webpage."""
2852 self._reporter(u'%s: Downloading video webpage' % video_id)
2854 def report_information_extraction(self, video_id):
2855 """Report attempt to extract video information."""
2856 self._reporter(u'%s: Extracting video information' % video_id)
2858 def _parse_page(self, video_webpage):
2859 """Extract video information from page"""
# Metadata lives in JS calls on the page; each regex captures one field.
2861 data = {'title': r'\("video_title", "(.*?)"\)',
2862 'description': r'<div class="datawrap">(.*?)</div>',
2863 'owner': r'\("video_owner_name", "(.*?)"\)',
2864 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2867 for piece in data.keys():
2868 mobj = re.search(data[piece], video_webpage)
2869 if mobj is not None:
# Values are \uXXXX-escaped inside the JS; unicode_escape undoes that.
2870 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format name found on the page.
2874 for fmt in self._available_formats:
2875 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2876 if mobj is not None:
2877 # URL is in a Javascript segment inside an escaped Unicode format within
2878 # the generally utf-8 page
2879 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2880 video_info['video_urls'] = video_urls
# Log in (if credentials are available) before any extraction happens.
2884 def _real_initialize(self):
2885 if self._downloader is None:
2890 downloader_params = self._downloader.params
2892 # Attempt to use provided username and password or .netrc data
2893 if downloader_params.get('username', None) is not None:
2894 useremail = downloader_params['username']
2895 password = downloader_params['password']
2896 elif downloader_params.get('usenetrc', False):
2898 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2899 if info is not None:
2903 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2904 except (IOError, netrc.NetrcParseError), err:
2905 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials at all: proceed anonymously (elided branch presumably
# returns here — TODO confirm against full source).
2908 if useremail is None:
2917 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2920 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
2921 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2922 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2925 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Fetch the video page, parse it, choose formats, and hand each selected
# format to the downloader.
2928 def _real_extract(self, url):
2929 mobj = re.match(self._VALID_URL, url)
2931 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2933 video_id = mobj.group('ID')
2936 self.report_video_webpage_download(video_id)
2937 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2939 page = urllib2.urlopen(request)
2940 video_webpage = page.read()
2941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2942 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2945 # Start extracting information
2946 self.report_information_extraction(video_id)
2948 # Extract information
2949 video_info = self._parse_page(video_webpage)
2952 if 'owner' not in video_info:
2953 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2955 video_uploader = video_info['owner']
2958 if 'title' not in video_info:
2959 self._downloader.trouble(u'ERROR: unable to extract video title')
2961 video_title = video_info['title']
2962 video_title = video_title.decode('utf-8')
2963 video_title = sanitize_title(video_title)
2965 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning, not fatal.
2968 if 'thumbnail' not in video_info:
2969 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2970 video_thumbnail = ''
2972 video_thumbnail = video_info['thumbnail']
2976 if 'upload_date' in video_info:
2977 upload_time = video_info['upload_date']
2978 timetuple = email.utils.parsedate_tz(upload_time)
2979 if timetuple is not None:
2981 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2986 video_description = video_info.get('description', 'No description available.')
2988 url_map = video_info['video_urls']
2989 if len(url_map.keys()) > 0:
2990 # Decide which formats to download
# Same format-selection scheme as the YouTube extractor: optional upper
# limit, then best (default) / 'worst' / '-1' (all) / a specific format.
2991 req_format = self._downloader.params.get('format', None)
2992 format_limit = self._downloader.params.get('format_limit', None)
2994 if format_limit is not None and format_limit in self._available_formats:
2995 format_list = self._available_formats[self._available_formats.index(format_limit):]
2997 format_list = self._available_formats
2998 existing_formats = [x for x in format_list if x in url_map]
2999 if len(existing_formats) == 0:
3000 self._downloader.trouble(u'ERROR: no known formats available for video')
3002 if req_format is None:
3003 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3004 elif req_format == 'worst':
3005 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3006 elif req_format == '-1':
3007 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3010 if req_format not in url_map:
3011 self._downloader.trouble(u'ERROR: requested format not available')
3013 video_url_list = [(req_format, url_map[req_format])] # Specific format
3015 for format_param, video_real_url in video_url_list:
3017 # At this point we have a new video
3018 self._downloader.increment_downloads()
3021 video_extension = self._video_extensions.get(format_param, 'mp4')
3024 # Process video information
3025 self._downloader.process_info({
3026 'id': video_id.decode('utf-8'),
3027 'url': video_real_url.decode('utf-8'),
3028 'uploader': video_uploader.decode('utf-8'),
3029 'upload_date': upload_date,
3030 'title': video_title,
3031 'stitle': simple_title,
3032 'ext': video_extension.decode('utf-8'),
3033 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3034 'thumbnail': video_thumbnail.decode('utf-8'),
3035 'description': video_description.decode('utf-8'),
3038 except UnavailableVideoError, err:
3039 self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Two extraction paths: the page is re-requested with skin=json; if the
    server answers with a video/* Content-Type it is treated as a direct
    download, otherwise the body is parsed as JSON (a 'Post' wrapper) to
    build the info dictionary.

    NOTE(review): this listing elides lines (None-guards, try: headers and
    parts of dict literals); the gaps are marked with comments below.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # captures the last dot-suffix of the media URL
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        # NOTE(review): docstring says "information extraction" but this
        # reports a direct-download detection.
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return around the next line]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [elided: choice of cchar ('?' vs '&') depending on whether url has a query]
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        # [elided: info = None init and try: header]
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [elided: rest of the direct-download info dict literal]
            'stitle': _simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            # [elided: try: header]
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
            # [elided: try: header]
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [elided: else branch]
            # blip.tv datestamps look like '05-31-11 11:05PM'; normalized to YYYYMMDD
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [elided: `if umobj is None:` guard]
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)
            # [elided: info dict opening and some keys]
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'stitle': _simplify_title(data['title']),
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
            # [elided: dict close]
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        self._downloader.increment_downloads()
        # [elided: try: header]
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3135 class MyVideoIE(InfoExtractor):
3136 """Information Extractor for myvideo.de."""
3138 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3139 IE_NAME = u'myvideo'
3141 def __init__(self, downloader=None):
3142 InfoExtractor.__init__(self, downloader)
3144 def report_download_webpage(self, video_id):
3145 """Report webpage download."""
3146 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3148 def report_extraction(self, video_id):
3149 """Report information extraction."""
3150 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3152 def _real_extract(self,url):
3153 mobj = re.match(self._VALID_URL, url)
3155 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3158 video_id = mobj.group(1)
3161 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3163 self.report_download_webpage(video_id)
3164 webpage = urllib2.urlopen(request).read()
3165 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3166 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3169 self.report_extraction(video_id)
3170 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3173 self._downloader.trouble(u'ERROR: unable to extract media URL')
3175 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3177 mobj = re.search('<title>([^<]+)</title>', webpage)
3179 self._downloader.trouble(u'ERROR: unable to extract title')
3182 video_title = mobj.group(1)
3183 video_title = sanitize_title(video_title)
3185 simple_title = _simplify_title(video_title)
3188 self._downloader.process_info({
3192 'upload_date': u'NA',
3193 'title': video_title,
3194 'stitle': simple_title,
3199 except UnavailableVideoError:
3200 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts :tds / :colbert style shortcuts as well as full-episode pages
    # on either show site.
    # NOTE(review): this listing elides lines (guards, try: headers, list/dict
    # literals); the gaps are marked with comments below.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand the :shortname abbreviations to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [elided: else branch]
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
        # [elided: if dlNewest / else headers]
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        # [elided: try: header]
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # [elided: return; newest-episode redirect handling guard]
        # The site redirects /full-episodes/ to the latest episode's page.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard]
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        # [elided: return]
        epTitle = mobj.group('episode')

        # Captures both the Flash player URL and the mgid-style media URI.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # [elided: try: header]
        # Follow redirects so the final player URL is recorded.
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # [elided: try: header]
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One MRSS <item> per video act of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            # [elided: try: header]
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [elided: turls list init and append of finfo]
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
            # [elided: empty-turls guard]
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            # [elided: info dict opening and several keys]
            'upload_date': officialDate,
            'stitle': _simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl
            # [elided: dict close; try: header]

            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): this listing elides lines (guards, try: headers, the
    # info dict literal); the gaps are marked with comments below.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # [elided: try: header]
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Page metadata: description, thumbnail and the Flash player URL,
        # whose config= query argument points at the JSON playlist.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # [elided: try: header]
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # [elided: try: header]
        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # NOTE(review): index 1 presumably skips a leading non-content
        # playlist entry -- confirm against the config format.
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        # [elided: info dict opening and several keys]
        'uploader': showName,
        'upload_date': None,
        'stitle': _simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
        # [elided: dict close; try: header]

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): this listing elides lines (guards, try: headers, the
    # info dict opening); the gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        # [elided: try: header]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal id ("video:<n>") used by the
        # moogaloop XML endpoint below.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        # [elided: `if m is None:` guard and return]
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        # [elided: info dict opening with other keys]
        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        # [elided: try: header]
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [elided: try: header]
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        # [elided: except IndexError handler header]
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()

        # [elided: try: header]
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): this listing elides lines (guards, try: headers, the
    # info dict opening); the gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        # [elided: try: header]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the page's flv_url variable)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Extract title (the part of <title> before " - XVID...")
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        # [elided: info dict opening and several keys]
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,
        # [elided: dict close; try: header]

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    # NOTE(review): this listing elides lines (guards, try: headers, parts
    # of the info dict); the gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        # [elided: try: header]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        # [elided: match guard]
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        # [elided: match guard]
        title = mobj.group(1)
        # NOTE(review): `title` is extracted here but the info dict below
        # uses simple_title for both 'title' and 'stitle' -- looks like the
        # real title is dropped unintentionally; confirm before changing.

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description defaults when the page has none
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        # [elided: match guard]
        description = mobj.group(1)

        # upload date, scraped from the pretty-date element
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        # [elided: match guard / try: header]
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            # [elided: handler body]

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        # [elided: try: header]
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
            # [elided: remaining keys and dict/call close]
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): this listing elides lines (IE_NAME, guards, try:
    # headers, the info dict opening); the gaps are marked below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        # [elided: try: header]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded path that is
        # appended to the RTMPE base to form the stream location.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description (optional -- falls back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # The id and extension come from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        # [elided: info dict opening and several keys]
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
        # [elided: dict close; try: header]

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): this listing elides lines (guards, try: headers,
    # returns, break statements); the gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # [elided: try: header]
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # [elided: return url_list]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # [elided: try: header; return url on success]
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                # [elided: handler body; fall through to next url]

        # [elided: return None when no url was reachable]

    def _print_formats(self, formats):
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                # [elided: try: header]
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
                    # [elided: break]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        # [elided: try: header]
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse JSON: player URL plus the per-format/per-bitrate url lists
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [elided: return]

        if req_format is None or req_format == 'best':
            # probe formats until one has a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # [elided: break]
        # [elided: else branch for an explicitly requested format]
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                # [elided: return]

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        # [elided: try: header]
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
            # [elided: dict/call close]
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes are handled: a specific VideoPage, a CoursePage, and
    # the root HomePage (which enumerates courses). Course/root pages emit
    # 'reference' entries that are re-fed through self.extract().
    # NOTE(review): this listing elides lines (guards, try: headers, dict
    # literals); the gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # [elided: info dict opening]
            'id': _simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # [elided: try: header]
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # [elided: try: header]
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # [elided: except IndexError handler header]
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            # [elided: try: header]
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            # [elided: info dict opening]
            'id': _simplify_title(course),

            self.report_download_webpage(info['id'])
            # [elided: try: header]
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # [elided: `if m:` header]
            info['title'] = unescapeHTML(m.group(1))
            # [elided: else header -- fall back to the id]
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            # [elided: `if m:` header]
            info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a 'reference' entry to re-extract.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # [elided: info['list'] comprehension opening]
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
            # [elided: comprehension close]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
        # [elided: else header -- root page]
            unescapeHTML = HTMLParser.HTMLParser().unescape
            # [elided: info dict opening]
            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # [elided: try: header]
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Each CoursePage link becomes a 'reference' entry to re-extract.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # [elided: info['list'] comprehension opening]
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
            # [elided: comprehension close]

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    # NOTE(review): this listing elides lines (IE_NAME, guards, try:
    # headers, the info dict opening); the gaps are marked below.
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        # [elided: try: header]
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # [elided: `if mobj is None:` guard and return]
        # NOTE(review): the message reads "unable to mtvn_uri" -- the word
        # "extract" is missing from this runtime string.
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # [elided: `if mobj is None:` guard and return]
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        # [elided: try: header]
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # [elided: try: header]
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # [elided: except KeyError handler header]
        self._downloader.trouble('Invalid rendition field.')
        # [elided: return]

        self._downloader.increment_downloads()
        # [elided: info dict opening and several keys]
        'uploader': performer,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        # [elided: dict close; try: header]

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	PostProcessor in the chain.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	# FileDownloader instance this PP reports to; set here or via set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
class AudioConversionError(Exception):
	"""Raised when ffmpeg fails to convert an audio stream.

	Derives from Exception (not BaseException): BaseException is reserved
	for interpreter-exit signals such as KeyboardInterrupt/SystemExit, and
	subclassing it made this error invisible to generic
	``except Exception`` handlers.
	"""

	def __init__(self, message):
		# Forward to the base class so str(err) carries the message too;
		# keep the .message attribute since callers read it directly.
		Exception.__init__(self, message)
		self.message = message
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that extracts the audio track of a downloaded video
	into a standalone audio file using ffprobe (codec detection) and
	ffmpeg (conversion)."""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the codec name of the audio stream in *path*, or None if
		ffprobe is unavailable, fails, or finds no audio stream."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# open() instead of the removed file() builtin, and close the
			# devnull handle (it used to leak).
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
			finally:
				devnull.close()
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name= before codec_type= for each stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Convert *path* to *out_path* with the given audio codec and extra
		ffmpeg options. Raises AudioConversionError on failure; codec=None
		lets ffmpeg pick the codec for the output container."""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:	# 2 == ENOENT: binary missing
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# The last stderr line usually carries the actual ffmpeg error.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert information['filepath'] to the preferred audio format.

		Returns the updated info dict on success, or None to stop the
		post-processing chain on any failure.
		"""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		# Narrowed from a bare except so KeyboardInterrupt/SystemExit now
		# propagate; AudioConversionError is listed explicitly because it
		# may not derive from Exception.
		except (AudioConversionError, Exception):
			e = sys.exc_info()[1]
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except (IOError, OSError):	# narrowed from a bare except
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen(u'Updating to latest version...')

	try:
		try:
			# Fetch the master copy; bail out early if the embedded
			# version string matches the running version.
			urlh = urllib.urlopen(UPDATE_URL)
			newcontent = urlh.read()
			vmatch = re.search("__version__ = '([^']+)'", newcontent)
			if vmatch is not None and vmatch.group(1) == __version__:
				downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
				return
		finally:
			urlh.close()
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')

	try:
		# Overwrite ourselves in binary mode so line endings survive intact.
		outf = open(filename, 'wb')
		try:
			outf.write(newcontent)
		finally:
			outf.close()
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4290 def _readOptions(filename_bytes):
4292 optionf = open(filename_bytes)
4294 return [] # silently skip if file is not present
4298 res += shlex.split(l, comments=True)
4303 def _format_option_string(option):
4304 ''' ('-o', '--option') -> -o, --format METAVAR'''
4308 if option._short_opts: opts.append(option._short_opts[0])
4309 if option._long_opts: opts.append(option._long_opts[0])
4310 if len(opts) > 1: opts.insert(1, ', ')
4312 if option.takes_value(): opts.append(' %s' % option.metavar)
4314 return "".join(opts)
4316 def _find_term_columns():
4317 columns = os.environ.get('COLUMNS', None)
4322 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4323 out,err = sp.communicate()
4324 return int(out.split()[1])
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version' : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# Option groups: each option below is attached to one of these, and
	# the groups are registered with the parser at the end.
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	# NOTE: '-v' resolves against '--version' above via conflict_handler='resolve'.
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)

	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)

	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')

	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Merge options from /etc, the user's XDG config and the real argv;
	# later sources win when options conflict.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# Extractors that share state (e.g. the YouTube login session) receive
	# the parent instance; more specific matchers must precede generic ones.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
	return [
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		youtube_ie,
		MetacafeIE(youtube_ie),
		google_ie,
		GoogleSearchIE(google_ie),
		yahoo_ie,
		YahooSearchIE(yahoo_ie),
		StanfordOpenClassroomIE(),
		MTVIE(),
		GenericIE()
	]
	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print std_headers['User-Agent']
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Lines starting with '#', '/' or ';' are treated as comments.
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args
	all_urls = map(lambda url: url.strip(), all_urls)

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	proxy_handler = urllib2.ProxyHandler()
	opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	if opts.verbose:
		print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

	extractors = gen_extractors()

	if opts.list_extractors:
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
			all_urls = filter(lambda url: url not in matchedUrls, all_urls)
			for mu in matchedUrls:
				print(u'  ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
			parser.error(u'invalid audio format specified')

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
		'format': opts.format,
		'format_limit': opts.format_limit,
		'listformats': opts.listformats,
		# First matching template wins; the last entry is the fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'writesubtitles': opts.writesubtitles,
		'subtitleslang': opts.subtitleslang,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		'max_downloads': opts.max_downloads,
		'prefer_free_formats': opts.prefer_free_formats,
		'verbose': opts.verbose,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()

	try:
		retcode = fd.download(all_urls)
	except MaxDownloadsReached:
		fd.to_screen(u'--max-download limit reached, aborting.')
		retcode = 101

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
	except DownloadError:
		# The downloader already printed the error details; just exit non-zero.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
	main()
4727 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: