2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
348 class MaxDownloadsReached(Exception):
349 """ --max-downloads limit has been reached. """
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
373 def __init__(self, downloaded, expected):
374 self.downloaded = downloaded
375 self.expected = expected
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 writesubtitles: Write the video subtitles to a .srt file
494 subtitleslang: Language of the subtitles to download
500 _download_retcode = None
501 _num_downloads = None
504 def __init__(self, params):
505 """Create a FileDownloader object with the given options."""
508 self._download_retcode = 0
509 self._num_downloads = 0
510 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
514 def format_bytes(bytes):
517 if type(bytes) is str:
522 exponent = long(math.log(bytes, 1024.0))
523 suffix = 'bkMGTPEZY'[exponent]
524 converted = float(bytes) / float(1024 ** exponent)
525 return '%.2f%s' % (converted, suffix)
528 def calc_percent(byte_counter, data_len):
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
534 def calc_eta(start, now, total, current):
538 if current == 0 or dif < 0.001: # One millisecond
540 rate = float(current) / dif
541 eta = long((float(total) - float(current)) / rate)
542 (eta_mins, eta_secs) = divmod(eta, 60)
545 return '%02d:%02d' % (eta_mins, eta_secs)
548 def calc_speed(start, now, bytes):
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
555 def best_block_size(elapsed_time, bytes):
556 new_min = max(bytes / 2.0, 1.0)
557 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
558 if elapsed_time < 0.001:
560 rate = bytes / elapsed_time
568 def parse_bytes(bytestr):
569 """Parse a string indicating a byte quantity into a long integer."""
570 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
573 number = float(matchobj.group(1))
574 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
575 return long(round(number * multiplier))
577 def add_info_extractor(self, ie):
578 """Add an InfoExtractor object to the end of the list."""
580 ie.set_downloader(self)
582 def add_post_processor(self, pp):
583 """Add a PostProcessor object to the end of the chain."""
585 pp.set_downloader(self)
587 def to_screen(self, message, skip_eol=False):
588 """Print message to stdout if not in quiet mode."""
589 assert type(message) == type(u'')
590 if not self.params.get('quiet', False):
591 terminator = [u'\n', u''][skip_eol]
592 output = message + terminator
594 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
595 output = output.encode(preferredencoding(), 'ignore')
596 self._screen_file.write(output)
597 self._screen_file.flush()
599 def to_stderr(self, message):
600 """Print message to stderr."""
601 print >>sys.stderr, message.encode(preferredencoding())
603 def to_cons_title(self, message):
604 """Set console/terminal window title to message."""
605 if not self.params.get('consoletitle', False):
607 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
608 # c_wchar_p() might not be necessary if `message` is
609 # already of type unicode()
610 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
611 elif 'TERM' in os.environ:
612 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614 def fixed_template(self):
615 """Checks if the output template is fixed."""
616 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618 def trouble(self, message=None):
619 """Determine action to take when a download problem appears.
621 Depending on if the downloader has been configured to ignore
622 download errors or not, this method may throw an exception or
623 not when errors are found, after printing the message.
625 if message is not None:
626 self.to_stderr(message)
627 if not self.params.get('ignoreerrors', False):
628 raise DownloadError(message)
629 self._download_retcode = 1
631 def slow_down(self, start_time, byte_counter):
632 """Sleep if the download speed is over the rate limit."""
633 rate_limit = self.params.get('ratelimit', None)
634 if rate_limit is None or byte_counter == 0:
637 elapsed = now - start_time
640 speed = float(byte_counter) / elapsed
641 if speed > rate_limit:
642 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649 return filename + u'.part'
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
656 def try_rename(self, old_filename, new_filename):
658 if old_filename == new_filename:
660 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
661 except (IOError, OSError), err:
662 self.trouble(u'ERROR: unable to rename file')
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
668 if not os.path.isfile(_encodeFilename(filename)):
670 timestr = last_modified_hdr
673 filetime = timeconvert(timestr)
677 os.utime(filename, (time.time(), filetime))
682 def report_writedescription(self, descfn):
683 """ Report that the description file is being written """
684 self.to_screen(u'[info] Writing video description to: ' + descfn)
686 def report_writesubtitles(self, srtfn):
687 """ Report that the subtitles file is being written """
688 self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690 def report_writeinfojson(self, infofn):
691 """ Report that the metadata file has been written """
692 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694 def report_destination(self, filename):
695 """Report destination filename."""
696 self.to_screen(u'[download] Destination: ' + filename)
698 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
699 """Report download progress."""
700 if self.params.get('noprogress', False):
702 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
703 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
704 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
705 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707 def report_resuming_byte(self, resume_len):
708 """Report attempt to resume at given byte."""
709 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711 def report_retry(self, count, retries):
712 """Report retry in case of HTTP error 5xx"""
713 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715 def report_file_already_downloaded(self, file_name):
716 """Report file has already been fully downloaded."""
718 self.to_screen(u'[download] %s has already been downloaded' % file_name)
719 except (UnicodeEncodeError), err:
720 self.to_screen(u'[download] The file has already been downloaded')
722 def report_unable_to_resume(self):
723 """Report it was impossible to resume download."""
724 self.to_screen(u'[download] Unable to resume')
726 def report_finish(self):
727 """Report download finished."""
728 if self.params.get('noprogress', False):
729 self.to_screen(u'[download] Download completed')
733 def increment_downloads(self):
734 """Increment the ordinal that assigns a number to each file."""
735 self._num_downloads += 1
737 def prepare_filename(self, info_dict):
738 """Generate the output filename."""
740 template_dict = dict(info_dict)
741 template_dict['epoch'] = unicode(long(time.time()))
742 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
743 filename = self.params['outtmpl'] % template_dict
745 except (ValueError, KeyError), err:
746 self.trouble(u'ERROR: invalid system charset or erroneous output template')
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
774 filename = self.prepare_filename(info_dict)
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
805 if self.params.get('writedescription', False):
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
811 descfile.write(info_dict['description'].encode('utf-8'))
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
842 infof = open(_encodeFilename(infofn), 'wb')
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
874 def download(self, url_list):
875 """Download a given list of URLs."""
876 if len(url_list) > 1 and self.fixed_template():
877 raise SameFileError(self.params['outtmpl'])
880 suitable_found = False
882 # Go to next InfoExtractor if not suitable
883 if not ie.suitable(url):
886 # Suitable InfoExtractor found
887 suitable_found = True
889 # Extract information from URL and process it
892 # Suitable InfoExtractor had been found; go to next URL
895 if not suitable_found:
896 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898 return self._download_retcode
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
903 info['filepath'] = filename
909 def _download_with_rtmpdump(self, filename, url, player_url):
910 self.report_destination(filename)
911 tmpfilename = self.temp_name(filename)
913 # Check for rtmpdump first
915 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
916 except (OSError, IOError):
917 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
920 # Download using rtmpdump. rtmpdump returns exit code 2 when
921 # the connection was interrumpted and resuming appears to be
922 # possible. This is part of rtmpdump's normal usage, AFAIK.
923 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
924 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
925 if self.params.get('verbose', False):
928 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
931 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
932 retval = subprocess.call(args)
933 while retval == 2 or retval == 1:
934 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
935 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
936 time.sleep(5.0) # This seems to be needed
937 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
938 cursize = os.path.getsize(_encodeFilename(tmpfilename))
939 if prevsize == cursize and retval == 1:
941 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
942 if prevsize == cursize and retval == 2 and cursize > 1024:
943 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
947 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
948 self.try_rename(tmpfilename, filename)
951 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
967 tmpfilename = self.temp_name(filename)
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1033 if count <= retries:
1034 self.report_retry(count, retries)
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1052 if len(data_block) == 0:
1054 byte_counter += len(data_block)
1056 # Open file just in time
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071 block_size = self.best_block_size(after - before, len(data_block))
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1083 self.slow_down(start, byte_counter - resume_len)
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1101 class InfoExtractor(object):
1102 """Information Extractor class.
1104 Information extractors are the classes that, given a URL, extract
1105 information from the video (or videos) the URL refers to. This
1106 information includes the real video URL, the video title and simplified
1107 title, author and others. The information is stored in a dictionary
1108 which is then passed to the FileDownloader. The FileDownloader
1109 processes this information possibly downloading the video to the file
1110 system, among other possible outcomes. The dictionaries must include
1111 the following fields:
1113 id: Video identifier.
1114 url: Final video URL.
1115 uploader: Nickname of the video uploader.
1116 title: Literal title.
1117 stitle: Simplified title.
1118 ext: Video filename extension.
1119 format: Video format.
1120 player_url: SWF Player URL (may be None).
1122 The following fields are optional. Their primary purpose is to allow
1123 youtube-dl to serve as the backend for a video search function, such
1124 as the one in youtube2mp3. They are only used when their respective
1125 forced printing functions are called:
1127 thumbnail: Full URL to a video thumbnail image.
1128 description: One-line video description.
1130 Subclasses of this one should re-define the _real_initialize() and
1131 _real_extract() methods and define a _VALID_URL regexp.
1132 Probably, they should also be added to the list of extractors.
1138 def __init__(self, downloader=None):
1139 """Constructor. Receives an optional downloader."""
1141 self.set_downloader(downloader)
1143 def suitable(self, url):
1144 """Receives a URL and returns True if suitable for this IE."""
1145 return re.match(self._VALID_URL, url) is not None
1147 def initialize(self):
1148 """Initializes an instance (authentication, etc)."""
1150 self._real_initialize()
1153 def extract(self, url):
1154 """Extracts URL information and returns it in list of dicts."""
1156 return self._real_extract(url)
1158 def set_downloader(self, downloader):
1159 """Sets the downloader for this IE."""
1160 self._downloader = downloader
1162 def _real_initialize(self):
1163 """Real initialization process. Redefine in subclasses."""
1166 def _real_extract(self, url):
1167 """Real extraction process. Redefine in subclasses."""
1171 class YoutubeIE(InfoExtractor):
1172 """Information extractor for youtube.com."""
1174 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1175 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1176 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1177 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1178 _NETRC_MACHINE = 'youtube'
1179 # Listed in order of quality
1180 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1181 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1182 _video_extensions = {
1188 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1194 _video_dimensions = {
1210 IE_NAME = u'youtube'
1212 def report_lang(self):
1213 """Report attempt to set language."""
1214 self._downloader.to_screen(u'[youtube] Setting language')
1216 def report_login(self):
1217 """Report attempt to log in."""
1218 self._downloader.to_screen(u'[youtube] Logging in')
1220 def report_age_confirmation(self):
1221 """Report attempt to confirm age."""
1222 self._downloader.to_screen(u'[youtube] Confirming age')
1224 def report_video_webpage_download(self, video_id):
1225 """Report attempt to download video webpage."""
1226 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1228 def report_video_info_webpage_download(self, video_id):
1229 """Report attempt to download video info webpage."""
1230 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1232 def report_video_subtitles_download(self, video_id):
1233 """Report attempt to download video info webpage."""
1234 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1236 def report_information_extraction(self, video_id):
1237 """Report attempt to extract video information."""
1238 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1240 def report_unavailable_format(self, video_id, format):
1241 """Report extracted video URL."""
1242 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1244 def report_rtmp_download(self):
1245 """Indicate the download will use the RTMP protocol."""
1246 self._downloader.to_screen(u'[youtube] RTMP download detected')
1248 def _closed_captions_xml_to_srt(self, xml_string):
1250 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1251 # TODO parse xml instead of regex
1252 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1253 if not dur: dur = '4'
1254 start = float(start)
1255 end = start + float(dur)
1256 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1257 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1259 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1260 srt += str(n) + '\n'
1261 srt += start + ' --> ' + end + '\n'
1262 srt += caption + '\n\n'
1265 def _print_formats(self, formats):
1266 print 'Available formats:'
1268 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1270 def _real_initialize(self):
1271 if self._downloader is None:
1276 downloader_params = self._downloader.params
1278 # Attempt to use provided username and password or .netrc data
1279 if downloader_params.get('username', None) is not None:
1280 username = downloader_params['username']
1281 password = downloader_params['password']
1282 elif downloader_params.get('usenetrc', False):
1284 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1285 if info is not None:
1289 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1290 except (IOError, netrc.NetrcParseError), err:
1291 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1295 request = urllib2.Request(self._LANG_URL)
1298 urllib2.urlopen(request).read()
1299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1303 # No authentication to be performed
1304 if username is None:
1309 'current_form': 'loginForm',
1311 'action_login': 'Log In',
1312 'username': username,
1313 'password': password,
1315 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1318 login_results = urllib2.urlopen(request).read()
1319 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1320 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1329 'action_confirm': 'Confirm',
1331 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1333 self.report_age_confirmation()
1334 age_results = urllib2.urlopen(request).read()
1335 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1336 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1339 def _real_extract(self, url):
1340 # Extract video id from URL
1341 mobj = re.match(self._VALID_URL, url)
1343 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1345 video_id = mobj.group(2)
1348 self.report_video_webpage_download(video_id)
1349 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1351 video_webpage = urllib2.urlopen(request).read()
1352 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1353 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1356 # Attempt to extract SWF player URL
1357 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1358 if mobj is not None:
1359 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1364 self.report_video_info_webpage_download(video_id)
1365 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1366 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1367 % (video_id, el_type))
1368 request = urllib2.Request(video_info_url)
1370 video_info_webpage = urllib2.urlopen(request).read()
1371 video_info = parse_qs(video_info_webpage)
1372 if 'token' in video_info:
1374 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1377 if 'token' not in video_info:
1378 if 'reason' in video_info:
1379 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1381 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1384 # Start extracting information
1385 self.report_information_extraction(video_id)
1388 if 'author' not in video_info:
1389 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1391 video_uploader = urllib.unquote_plus(video_info['author'][0])
1394 if 'title' not in video_info:
1395 self._downloader.trouble(u'ERROR: unable to extract video title')
1397 video_title = urllib.unquote_plus(video_info['title'][0])
1398 video_title = video_title.decode('utf-8')
1399 video_title = sanitize_title(video_title)
1402 simple_title = _simplify_title(video_title)
1405 if 'thumbnail_url' not in video_info:
1406 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1407 video_thumbnail = ''
1408 else: # don't panic if we can't find it
1409 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1413 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1414 if mobj is not None:
1415 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1416 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1417 for expression in format_expressions:
1419 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1427 video_description = u'No description available.'
1428 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1429 if mobj is not None:
1430 video_description = mobj.group(1).decode('utf-8')
1432 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1433 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1434 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1435 # TODO use another parser
1438 video_subtitles = None
1439 if self._downloader.params.get('writesubtitles', False):
1440 self.report_video_subtitles_download(video_id)
1441 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1443 srt_list = urllib2.urlopen(request).read()
1444 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1445 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1447 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1449 if self._downloader.params.get('subtitleslang', False):
1450 srt_lang = self._downloader.params.get('subtitleslang')
1451 elif 'en' in srt_lang_list:
1454 srt_lang = srt_lang_list[0]
1455 if not srt_lang in srt_lang_list:
1456 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1458 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1460 srt_xml = urllib2.urlopen(request).read()
1461 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1462 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1464 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1466 self._downloader.trouble(u'WARNING: video has no closed captions')
1469 video_token = urllib.unquote_plus(video_info['token'][0])
1471 # Decide which formats to download
1472 req_format = self._downloader.params.get('format', None)
1474 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1475 self.report_rtmp_download()
1476 video_url_list = [(None, video_info['conn'][0])]
1477 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1478 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1479 url_data = [parse_qs(uds) for uds in url_data_strs]
1480 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1481 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1483 format_limit = self._downloader.params.get('format_limit', None)
1484 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1485 if format_limit is not None and format_limit in available_formats:
1486 format_list = available_formats[available_formats.index(format_limit):]
1488 format_list = available_formats
1489 existing_formats = [x for x in format_list if x in url_map]
1490 if len(existing_formats) == 0:
1491 self._downloader.trouble(u'ERROR: no known formats available for video')
1493 if self._downloader.params.get('listformats', None):
1494 self._print_formats(existing_formats)
1496 if req_format is None or req_format == 'best':
1497 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1498 elif req_format == 'worst':
1499 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1500 elif req_format in ('-1', 'all'):
1501 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1503 # Specific formats. We pick the first in a slash-delimeted sequence.
1504 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1505 req_formats = req_format.split('/')
1506 video_url_list = None
1507 for rf in req_formats:
1509 video_url_list = [(rf, url_map[rf])]
1511 if video_url_list is None:
1512 self._downloader.trouble(u'ERROR: requested format not available')
1515 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1518 for format_param, video_real_url in video_url_list:
1519 # At this point we have a new video
1520 self._downloader.increment_downloads()
1523 video_extension = self._video_extensions.get(format_param, 'flv')
1526 # Process video information
1527 self._downloader.process_info({
1528 'id': video_id.decode('utf-8'),
1529 'url': video_real_url.decode('utf-8'),
1530 'uploader': video_uploader.decode('utf-8'),
1531 'upload_date': upload_date,
1532 'title': video_title,
1533 'stitle': simple_title,
1534 'ext': video_extension.decode('utf-8'),
1535 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1536 'thumbnail': video_thumbnail.decode('utf-8'),
1537 'description': video_description,
1538 'player_url': player_url,
1539 'subtitles': video_subtitles
1541 except UnavailableVideoError, err:
1542 self._downloader.trouble(u'\nERROR: unable to download video')
1545 class MetacafeIE(InfoExtractor):
1546 """Information Extractor for metacafe.com."""
1548 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1549 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1550 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1552 IE_NAME = u'metacafe'
1554 def __init__(self, youtube_ie, downloader=None):
1555 InfoExtractor.__init__(self, downloader)
1556 self._youtube_ie = youtube_ie
1558 def report_disclaimer(self):
1559 """Report disclaimer retrieval."""
1560 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1562 def report_age_confirmation(self):
1563 """Report attempt to confirm age."""
1564 self._downloader.to_screen(u'[metacafe] Confirming age')
1566 def report_download_webpage(self, video_id):
1567 """Report webpage download."""
1568 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1570 def report_extraction(self, video_id):
1571 """Report information extraction."""
1572 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1574 def _real_initialize(self):
1575 # Retrieve disclaimer
1576 request = urllib2.Request(self._DISCLAIMER)
1578 self.report_disclaimer()
1579 disclaimer = urllib2.urlopen(request).read()
1580 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1581 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1587 'submit': "Continue - I'm over 18",
1589 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1591 self.report_age_confirmation()
1592 disclaimer = urllib2.urlopen(request).read()
1593 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1597 def _real_extract(self, url):
1598 # Extract id and simplified title from URL
1599 mobj = re.match(self._VALID_URL, url)
1601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1604 video_id = mobj.group(1)
1606 # Check if video comes from YouTube
1607 mobj2 = re.match(r'^yt-(.*)$', video_id)
1608 if mobj2 is not None:
1609 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1612 # At this point we have a new video
1613 self._downloader.increment_downloads()
1615 simple_title = mobj.group(2).decode('utf-8')
1617 # Retrieve video webpage to extract further information
1618 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1620 self.report_download_webpage(video_id)
1621 webpage = urllib2.urlopen(request).read()
1622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1626 # Extract URL, uploader and title from webpage
1627 self.report_extraction(video_id)
1628 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1629 if mobj is not None:
1630 mediaURL = urllib.unquote(mobj.group(1))
1631 video_extension = mediaURL[-3:]
1633 # Extract gdaKey if available
1634 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1636 video_url = mediaURL
1638 gdaKey = mobj.group(1)
1639 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1641 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1643 self._downloader.trouble(u'ERROR: unable to extract media URL')
1645 vardict = parse_qs(mobj.group(1))
1646 if 'mediaData' not in vardict:
1647 self._downloader.trouble(u'ERROR: unable to extract media URL')
1649 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1651 self._downloader.trouble(u'ERROR: unable to extract media URL')
1653 mediaURL = mobj.group(1).replace('\\/', '/')
1654 video_extension = mediaURL[-3:]
1655 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1657 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1659 self._downloader.trouble(u'ERROR: unable to extract title')
1661 video_title = mobj.group(1).decode('utf-8')
1662 video_title = sanitize_title(video_title)
1664 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1666 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1668 video_uploader = mobj.group(1)
1671 # Process video information
1672 self._downloader.process_info({
1673 'id': video_id.decode('utf-8'),
1674 'url': video_url.decode('utf-8'),
1675 'uploader': video_uploader.decode('utf-8'),
1676 'upload_date': u'NA',
1677 'title': video_title,
1678 'stitle': simple_title,
1679 'ext': video_extension.decode('utf-8'),
1683 except UnavailableVideoError:
1684 self._downloader.trouble(u'\nERROR: unable to download video')
1687 class DailymotionIE(InfoExtractor):
1688 """Information Extractor for Dailymotion"""
1690 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1691 IE_NAME = u'dailymotion'
1693 def __init__(self, downloader=None):
1694 InfoExtractor.__init__(self, downloader)
1696 def report_download_webpage(self, video_id):
1697 """Report webpage download."""
1698 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1700 def report_extraction(self, video_id):
1701 """Report information extraction."""
1702 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1704 def _real_extract(self, url):
1705 # Extract id and simplified title from URL
1706 mobj = re.match(self._VALID_URL, url)
1708 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1711 # At this point we have a new video
1712 self._downloader.increment_downloads()
1713 video_id = mobj.group(1)
1715 video_extension = 'flv'
1717 # Retrieve video webpage to extract further information
1718 request = urllib2.Request(url)
1719 request.add_header('Cookie', 'family_filter=off')
1721 self.report_download_webpage(video_id)
1722 webpage = urllib2.urlopen(request).read()
1723 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1724 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1727 # Extract URL, uploader and title from webpage
1728 self.report_extraction(video_id)
1729 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1731 self._downloader.trouble(u'ERROR: unable to extract media URL')
1733 sequence = urllib.unquote(mobj.group(1))
1734 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1736 self._downloader.trouble(u'ERROR: unable to extract media URL')
1738 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1740 # if needed add http://www.dailymotion.com/ if relative URL
1742 video_url = mediaURL
1744 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1746 self._downloader.trouble(u'ERROR: unable to extract title')
1748 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1749 video_title = sanitize_title(video_title)
1750 simple_title = _simplify_title(video_title)
1752 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1754 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1756 video_uploader = mobj.group(1)
1759 # Process video information
1760 self._downloader.process_info({
1761 'id': video_id.decode('utf-8'),
1762 'url': video_url.decode('utf-8'),
1763 'uploader': video_uploader.decode('utf-8'),
1764 'upload_date': u'NA',
1765 'title': video_title,
1766 'stitle': simple_title,
1767 'ext': video_extension.decode('utf-8'),
1771 except UnavailableVideoError:
1772 self._downloader.trouble(u'\nERROR: unable to download video')
1775 class GoogleIE(InfoExtractor):
1776 """Information extractor for video.google.com."""
1778 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1779 IE_NAME = u'video.google'
1781 def __init__(self, downloader=None):
1782 InfoExtractor.__init__(self, downloader)
1784 def report_download_webpage(self, video_id):
1785 """Report webpage download."""
1786 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1788 def report_extraction(self, video_id):
1789 """Report information extraction."""
1790 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1792 def _real_extract(self, url):
1793 # Extract id from URL
1794 mobj = re.match(self._VALID_URL, url)
1796 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1799 # At this point we have a new video
1800 self._downloader.increment_downloads()
1801 video_id = mobj.group(1)
1803 video_extension = 'mp4'
1805 # Retrieve video webpage to extract further information
1806 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1808 self.report_download_webpage(video_id)
1809 webpage = urllib2.urlopen(request).read()
1810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1811 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1814 # Extract URL, uploader, and title from webpage
1815 self.report_extraction(video_id)
1816 mobj = re.search(r"download_url:'([^']+)'", webpage)
1818 video_extension = 'flv'
1819 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1821 self._downloader.trouble(u'ERROR: unable to extract media URL')
1823 mediaURL = urllib.unquote(mobj.group(1))
1824 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1825 mediaURL = mediaURL.replace('\\x26', '\x26')
1827 video_url = mediaURL
1829 mobj = re.search(r'<title>(.*)</title>', webpage)
1831 self._downloader.trouble(u'ERROR: unable to extract title')
1833 video_title = mobj.group(1).decode('utf-8')
1834 video_title = sanitize_title(video_title)
1835 simple_title = _simplify_title(video_title)
1837 # Extract video description
1838 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1840 self._downloader.trouble(u'ERROR: unable to extract video description')
1842 video_description = mobj.group(1).decode('utf-8')
1843 if not video_description:
1844 video_description = 'No description available.'
1846 # Extract video thumbnail
1847 if self._downloader.params.get('forcethumbnail', False):
1848 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1850 webpage = urllib2.urlopen(request).read()
1851 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1854 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1856 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1858 video_thumbnail = mobj.group(1)
1859 else: # we need something to pass to process_info
1860 video_thumbnail = ''
1863 # Process video information
1864 self._downloader.process_info({
1865 'id': video_id.decode('utf-8'),
1866 'url': video_url.decode('utf-8'),
1868 'upload_date': u'NA',
1869 'title': video_title,
1870 'stitle': simple_title,
1871 'ext': video_extension.decode('utf-8'),
1875 except UnavailableVideoError:
1876 self._downloader.trouble(u'\nERROR: unable to download video')
1879 class PhotobucketIE(InfoExtractor):
1880 """Information extractor for photobucket.com."""
1882 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1883 IE_NAME = u'photobucket'
1885 def __init__(self, downloader=None):
1886 InfoExtractor.__init__(self, downloader)
1888 def report_download_webpage(self, video_id):
1889 """Report webpage download."""
1890 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1892 def report_extraction(self, video_id):
1893 """Report information extraction."""
1894 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1896 def _real_extract(self, url):
1897 # Extract id from URL
1898 mobj = re.match(self._VALID_URL, url)
1900 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1903 # At this point we have a new video
1904 self._downloader.increment_downloads()
1905 video_id = mobj.group(1)
1907 video_extension = 'flv'
1909 # Retrieve video webpage to extract further information
1910 request = urllib2.Request(url)
1912 self.report_download_webpage(video_id)
1913 webpage = urllib2.urlopen(request).read()
1914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1918 # Extract URL, uploader, and title from webpage
1919 self.report_extraction(video_id)
1920 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1922 self._downloader.trouble(u'ERROR: unable to extract media URL')
1924 mediaURL = urllib.unquote(mobj.group(1))
1926 video_url = mediaURL
1928 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1930 self._downloader.trouble(u'ERROR: unable to extract title')
1932 video_title = mobj.group(1).decode('utf-8')
1933 video_title = sanitize_title(video_title)
1934 simple_title = _simplify_title(vide_title)
1936 video_uploader = mobj.group(2).decode('utf-8')
1939 # Process video information
1940 self._downloader.process_info({
1941 'id': video_id.decode('utf-8'),
1942 'url': video_url.decode('utf-8'),
1943 'uploader': video_uploader,
1944 'upload_date': u'NA',
1945 'title': video_title,
1946 'stitle': simple_title,
1947 'ext': video_extension.decode('utf-8'),
1951 except UnavailableVideoError:
1952 self._downloader.trouble(u'\nERROR: unable to download video')
1955 class YahooIE(InfoExtractor):
1956 """Information extractor for video.yahoo.com."""
1958 # _VALID_URL matches all Yahoo! Video URLs
1959 # _VPAGE_URL matches only the extractable '/watch/' URLs
1960 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1961 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1962 IE_NAME = u'video.yahoo'
1964 def __init__(self, downloader=None):
1965 InfoExtractor.__init__(self, downloader)
1967 def report_download_webpage(self, video_id):
1968 """Report webpage download."""
1969 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1971 def report_extraction(self, video_id):
1972 """Report information extraction."""
1973 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1975 def _real_extract(self, url, new_video=True):
1976 # Extract ID from URL
1977 mobj = re.match(self._VALID_URL, url)
1979 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1982 # At this point we have a new video
1983 self._downloader.increment_downloads()
1984 video_id = mobj.group(2)
1985 video_extension = 'flv'
1987 # Rewrite valid but non-extractable URLs as
1988 # extractable English language /watch/ URLs
1989 if re.match(self._VPAGE_URL, url) is None:
1990 request = urllib2.Request(url)
1992 webpage = urllib2.urlopen(request).read()
1993 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1994 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1997 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1999 self._downloader.trouble(u'ERROR: Unable to extract id field')
2001 yahoo_id = mobj.group(1)
2003 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2005 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2007 yahoo_vid = mobj.group(1)
2009 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2010 return self._real_extract(url, new_video=False)
2012 # Retrieve video webpage to extract further information
2013 request = urllib2.Request(url)
2015 self.report_download_webpage(video_id)
2016 webpage = urllib2.urlopen(request).read()
2017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2018 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2021 # Extract uploader and title from webpage
2022 self.report_extraction(video_id)
2023 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2025 self._downloader.trouble(u'ERROR: unable to extract video title')
2027 video_title = mobj.group(1).decode('utf-8')
2028 simple_title = _simplify_title(video_title)
2030 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2032 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2034 video_uploader = mobj.group(1).decode('utf-8')
2036 # Extract video thumbnail
2037 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2039 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2041 video_thumbnail = mobj.group(1).decode('utf-8')
2043 # Extract video description
2044 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2046 self._downloader.trouble(u'ERROR: unable to extract video description')
2048 video_description = mobj.group(1).decode('utf-8')
2049 if not video_description:
2050 video_description = 'No description available.'
2052 # Extract video height and width
2053 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2055 self._downloader.trouble(u'ERROR: unable to extract video height')
2057 yv_video_height = mobj.group(1)
2059 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2061 self._downloader.trouble(u'ERROR: unable to extract video width')
2063 yv_video_width = mobj.group(1)
2065 # Retrieve video playlist to extract media URL
2066 # I'm not completely sure what all these options are, but we
2067 # seem to need most of them, otherwise the server sends a 401.
2068 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2069 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2070 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2071 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2072 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2074 self.report_download_webpage(video_id)
2075 webpage = urllib2.urlopen(request).read()
2076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2077 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2080 # Extract media URL from playlist XML
2081 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2083 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2085 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2086 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2089 # Process video information
2090 self._downloader.process_info({
2091 'id': video_id.decode('utf-8'),
2093 'uploader': video_uploader,
2094 'upload_date': u'NA',
2095 'title': video_title,
2096 'stitle': simple_title,
2097 'ext': video_extension.decode('utf-8'),
2098 'thumbnail': video_thumbnail.decode('utf-8'),
2099 'description': video_description,
2100 'thumbnail': video_thumbnail,
2103 except UnavailableVideoError:
2104 self._downloader.trouble(u'\nERROR: unable to download video')
2107 class VimeoIE(InfoExtractor):
2108 """Information extractor for vimeo.com."""
2110 # _VALID_URL matches Vimeo URLs
2111 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2114 def __init__(self, downloader=None):
2115 InfoExtractor.__init__(self, downloader)
2117 def report_download_webpage(self, video_id):
2118 """Report webpage download."""
2119 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2121 def report_extraction(self, video_id):
2122 """Report information extraction."""
2123 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2125 def _real_extract(self, url, new_video=True):
2126 # Extract ID from URL
2127 mobj = re.match(self._VALID_URL, url)
2129 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2132 # At this point we have a new video
2133 self._downloader.increment_downloads()
2134 video_id = mobj.group(1)
2136 # Retrieve video webpage to extract further information
2137 request = urllib2.Request(url, None, std_headers)
2139 self.report_download_webpage(video_id)
2140 webpage = urllib2.urlopen(request).read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2145 # Now we begin extracting as much information as we can from what we
2146 # retrieved. First we extract the information common to all extractors,
2147 # and latter we extract those that are Vimeo specific.
2148 self.report_extraction(video_id)
2150 # Extract the config JSON
2151 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2153 config = json.loads(config)
2155 self._downloader.trouble(u'ERROR: unable to extract info section')
2159 video_title = config["video"]["title"]
2160 simple_title = _simplify_title(video_title)
2163 video_uploader = config["video"]["owner"]["name"]
2165 # Extract video thumbnail
2166 video_thumbnail = config["video"]["thumbnail"]
2168 # Extract video description
2172 video_description = u'No description available.'
2173 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2174 if mobj is not None:
2175 video_description = mobj.group(1)
2177 html_parser = lxml.etree.HTMLParser()
2178 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2179 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2180 # TODO use another parser
2182 # Extract upload date
2183 video_upload_date = u'NA'
2184 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2185 if mobj is not None:
2186 video_upload_date = mobj.group(1)
2188 # Vimeo specific: extract request signature and timestamp
2189 sig = config['request']['signature']
2190 timestamp = config['request']['timestamp']
2192 # Vimeo specific: extract video codec and quality information
2193 # TODO bind to format param
2194 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2195 for codec in codecs:
2196 if codec[0] in config["video"]["files"]:
2197 video_codec = codec[0]
2198 video_extension = codec[1]
2199 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2200 else: quality = 'sd'
2203 self._downloader.trouble(u'ERROR: no known codec found')
2206 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2207 %(video_id, sig, timestamp, quality, video_codec.upper())
2210 # Process video information
2211 self._downloader.process_info({
2214 'uploader': video_uploader,
2215 'upload_date': video_upload_date,
2216 'title': video_title,
2217 'stitle': simple_title,
2218 'ext': video_extension,
2219 'thumbnail': video_thumbnail,
2220 'description': video_description,
2223 except UnavailableVideoError:
2224 self._downloader.trouble(u'ERROR: unable to download video')
2227 class GenericIE(InfoExtractor):
2228 """Generic last-resort information extractor."""
2231 IE_NAME = u'generic'
2233 def __init__(self, downloader=None):
2234 InfoExtractor.__init__(self, downloader)
2236 def report_download_webpage(self, video_id):
2237 """Report webpage download."""
2238 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2239 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2241 def report_extraction(self, video_id):
2242 """Report information extraction."""
2243 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2245 def _real_extract(self, url):
2246 # At this point we have a new video
2247 self._downloader.increment_downloads()
2249 video_id = url.split('/')[-1]
2250 request = urllib2.Request(url)
2252 self.report_download_webpage(video_id)
2253 webpage = urllib2.urlopen(request).read()
2254 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2257 except ValueError, err:
2258 # since this is the last-resort InfoExtractor, if
2259 # this error is thrown, it'll be thrown here
2260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2263 self.report_extraction(video_id)
2264 # Start with something easy: JW Player in SWFObject
2265 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2267 # Broaden the search a little bit
2268 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2270 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2273 # It's possible that one of the regexes
2274 # matched, but returned an empty group:
2275 if mobj.group(1) is None:
2276 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2279 video_url = urllib.unquote(mobj.group(1))
2280 video_id = os.path.basename(video_url)
2282 # here's a fun little line of code for you:
2283 video_extension = os.path.splitext(video_id)[1][1:]
2284 video_id = os.path.splitext(video_id)[0]
2286 # it's tempting to parse this further, but you would
2287 # have to take into account all the variations like
2288 # Video Title - Site Name
2289 # Site Name | Video Title
2290 # Video Title - Tagline | Site Name
2291 # and so on and so forth; it's just not practical
2292 mobj = re.search(r'<title>(.*)</title>', webpage)
2294 self._downloader.trouble(u'ERROR: unable to extract title')
2296 video_title = mobj.group(1).decode('utf-8')
2297 video_title = sanitize_title(video_title)
2298 simple_title = _simplify_title(video_title)
2300 # video uploader is domain name
2301 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2303 self._downloader.trouble(u'ERROR: unable to extract title')
2305 video_uploader = mobj.group(1).decode('utf-8')
2308 # Process video information
2309 self._downloader.process_info({
2310 'id': video_id.decode('utf-8'),
2311 'url': video_url.decode('utf-8'),
2312 'uploader': video_uploader,
2313 'upload_date': u'NA',
2314 'title': video_title,
2315 'stitle': simple_title,
2316 'ext': video_extension.decode('utf-8'),
2320 except UnavailableVideoError, err:
2321 self._downloader.trouble(u'\nERROR: unable to download video')
2324 class YoutubeSearchIE(InfoExtractor):
2325 """Information Extractor for YouTube search queries."""
2326 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2327 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2329 _max_youtube_results = 1000
2330 IE_NAME = u'youtube:search'
2332 def __init__(self, youtube_ie, downloader=None):
2333 InfoExtractor.__init__(self, downloader)
2334 self._youtube_ie = youtube_ie
2336 def report_download_page(self, query, pagenum):
2337 """Report attempt to download playlist page with given number."""
2338 query = query.decode(preferredencoding())
2339 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2341 def _real_initialize(self):
2342 self._youtube_ie.initialize()
2344 def _real_extract(self, query):
2345 mobj = re.match(self._VALID_URL, query)
2347 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2350 prefix, query = query.split(':')
2352 query = query.encode('utf-8')
2354 self._download_n_results(query, 1)
2356 elif prefix == 'all':
2357 self._download_n_results(query, self._max_youtube_results)
2363 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2365 elif n > self._max_youtube_results:
2366 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2367 n = self._max_youtube_results
2368 self._download_n_results(query, n)
2370 except ValueError: # parsing prefix as integer fails
2371 self._download_n_results(query, 1)
2374 def _download_n_results(self, query, n):
2375 """Downloads a specified number of results for a query"""
2381 while (50 * pagenum) < limit:
2382 self.report_download_page(query, pagenum+1)
2383 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2384 request = urllib2.Request(result_url)
2386 data = urllib2.urlopen(request).read()
2387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2388 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2390 api_response = json.loads(data)['data']
2392 new_ids = list(video['id'] for video in api_response['items'])
2393 video_ids += new_ids
2395 limit = min(n, api_response['totalItems'])
2398 if len(video_ids) > n:
2399 video_ids = video_ids[:n]
2400 for id in video_ids:
2401 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2405 class GoogleSearchIE(InfoExtractor):
2406 """Information Extractor for Google Video search queries."""
2407 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2408 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2409 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2410 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2412 _max_google_results = 1000
2413 IE_NAME = u'video.google:search'
2415 def __init__(self, google_ie, downloader=None):
2416 InfoExtractor.__init__(self, downloader)
2417 self._google_ie = google_ie
2419 def report_download_page(self, query, pagenum):
2420 """Report attempt to download playlist page with given number."""
2421 query = query.decode(preferredencoding())
2422 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2424 def _real_initialize(self):
2425 self._google_ie.initialize()
2427 def _real_extract(self, query):
2428 mobj = re.match(self._VALID_URL, query)
2430 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2433 prefix, query = query.split(':')
2435 query = query.encode('utf-8')
2437 self._download_n_results(query, 1)
2439 elif prefix == 'all':
2440 self._download_n_results(query, self._max_google_results)
2446 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2448 elif n > self._max_google_results:
2449 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2450 n = self._max_google_results
2451 self._download_n_results(query, n)
2453 except ValueError: # parsing prefix as integer fails
2454 self._download_n_results(query, 1)
2457 def _download_n_results(self, query, n):
2458 """Downloads a specified number of results for a query"""
2464 self.report_download_page(query, pagenum)
2465 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2466 request = urllib2.Request(result_url)
2468 page = urllib2.urlopen(request).read()
2469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2473 # Extract video identifiers
2474 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2475 video_id = mobj.group(1)
2476 if video_id not in video_ids:
2477 video_ids.append(video_id)
2478 if len(video_ids) == n:
2479 # Specified n videos reached
2480 for id in video_ids:
2481 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2484 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2485 for id in video_ids:
2486 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2489 pagenum = pagenum + 1
2492 class YahooSearchIE(InfoExtractor):
2493 """Information Extractor for Yahoo! Video search queries."""
2494 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2495 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2496 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2497 _MORE_PAGES_INDICATOR = r'\s*Next'
2499 _max_yahoo_results = 1000
2500 IE_NAME = u'video.yahoo:search'
2502 def __init__(self, yahoo_ie, downloader=None):
2503 InfoExtractor.__init__(self, downloader)
2504 self._yahoo_ie = yahoo_ie
2506 def report_download_page(self, query, pagenum):
2507 """Report attempt to download playlist page with given number."""
2508 query = query.decode(preferredencoding())
2509 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2511 def _real_initialize(self):
2512 self._yahoo_ie.initialize()
2514 def _real_extract(self, query):
2515 mobj = re.match(self._VALID_URL, query)
2517 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2520 prefix, query = query.split(':')
2522 query = query.encode('utf-8')
2524 self._download_n_results(query, 1)
2526 elif prefix == 'all':
2527 self._download_n_results(query, self._max_yahoo_results)
2533 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2535 elif n > self._max_yahoo_results:
2536 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2537 n = self._max_yahoo_results
2538 self._download_n_results(query, n)
2540 except ValueError: # parsing prefix as integer fails
2541 self._download_n_results(query, 1)
2544 def _download_n_results(self, query, n):
2545 """Downloads a specified number of results for a query"""
2548 already_seen = set()
2552 self.report_download_page(query, pagenum)
2553 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2554 request = urllib2.Request(result_url)
2556 page = urllib2.urlopen(request).read()
2557 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2558 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2561 # Extract video identifiers
2562 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2563 video_id = mobj.group(1)
2564 if video_id not in already_seen:
2565 video_ids.append(video_id)
2566 already_seen.add(video_id)
2567 if len(video_ids) == n:
2568 # Specified n videos reached
2569 for id in video_ids:
2570 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2573 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2574 for id in video_ids:
2575 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2578 pagenum = pagenum + 1
2581 class YoutubePlaylistIE(InfoExtractor):
2582 """Information Extractor for YouTube playlists."""
2584 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2585 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2586 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2587 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2589 IE_NAME = u'youtube:playlist'
2591 def __init__(self, youtube_ie, downloader=None):
2592 InfoExtractor.__init__(self, downloader)
2593 self._youtube_ie = youtube_ie
2595 def report_download_page(self, playlist_id, pagenum):
2596 """Report attempt to download playlist page with given number."""
2597 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2599 def _real_initialize(self):
2600 self._youtube_ie.initialize()
2602 def _real_extract(self, url):
2603 # Extract playlist id
2604 mobj = re.match(self._VALID_URL, url)
2606 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2610 if mobj.group(3) is not None:
2611 self._youtube_ie.extract(mobj.group(3))
2614 # Download playlist pages
2615 # prefix is 'p' as default for playlists but there are other types that need extra care
2616 playlist_prefix = mobj.group(1)
2617 if playlist_prefix == 'a':
2618 playlist_access = 'artist'
2620 playlist_prefix = 'p'
2621 playlist_access = 'view_play_list'
2622 playlist_id = mobj.group(2)
2627 self.report_download_page(playlist_id, pagenum)
2628 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2629 request = urllib2.Request(url)
2631 page = urllib2.urlopen(request).read()
2632 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2633 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2636 # Extract video identifiers
2638 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2639 if mobj.group(1) not in ids_in_page:
2640 ids_in_page.append(mobj.group(1))
2641 video_ids.extend(ids_in_page)
2643 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2645 pagenum = pagenum + 1
2647 playliststart = self._downloader.params.get('playliststart', 1) - 1
2648 playlistend = self._downloader.params.get('playlistend', -1)
2649 if playlistend == -1:
2650 video_ids = video_ids[playliststart:]
2652 video_ids = video_ids[playliststart:playlistend]
2654 for id in video_ids:
2655 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2659 class YoutubeUserIE(InfoExtractor):
2660 """Information Extractor for YouTube users."""
2662 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2663 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2664 _GDATA_PAGE_SIZE = 50
2665 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2666 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2668 IE_NAME = u'youtube:user'
2670 def __init__(self, youtube_ie, downloader=None):
2671 InfoExtractor.__init__(self, downloader)
2672 self._youtube_ie = youtube_ie
2674 def report_download_page(self, username, start_index):
2675 """Report attempt to download user page."""
2676 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2677 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2679 def _real_initialize(self):
2680 self._youtube_ie.initialize()
2682 def _real_extract(self, url):
2684 mobj = re.match(self._VALID_URL, url)
2686 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2689 username = mobj.group(1)
2691 # Download video ids using YouTube Data API. Result size per
2692 # query is limited (currently to 50 videos) so we need to query
2693 # page by page until there are no video ids - it means we got
2700 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2701 self.report_download_page(username, start_index)
2703 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2706 page = urllib2.urlopen(request).read()
2707 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2708 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2711 # Extract video identifiers
2714 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2715 if mobj.group(1) not in ids_in_page:
2716 ids_in_page.append(mobj.group(1))
2718 video_ids.extend(ids_in_page)
2720 # A little optimization - if current page is not
2721 # "full", ie. does not contain PAGE_SIZE video ids then
2722 # we can assume that this page is the last one - there
2723 # are no more ids on further pages - no need to query
2726 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2731 all_ids_count = len(video_ids)
2732 playliststart = self._downloader.params.get('playliststart', 1) - 1
2733 playlistend = self._downloader.params.get('playlistend', -1)
2735 if playlistend == -1:
2736 video_ids = video_ids[playliststart:]
2738 video_ids = video_ids[playliststart:playlistend]
2740 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2741 (username, all_ids_count, len(video_ids)))
2743 for video_id in video_ids:
2744 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2747 class DepositFilesIE(InfoExtractor):
2748 """Information extractor for depositfiles.com"""
2750 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2751 IE_NAME = u'DepositFiles'
2753 def __init__(self, downloader=None):
2754 InfoExtractor.__init__(self, downloader)
2756 def report_download_webpage(self, file_id):
2757 """Report webpage download."""
2758 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2760 def report_extraction(self, file_id):
2761 """Report information extraction."""
2762 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2764 def _real_extract(self, url):
2765 # At this point we have a new file
2766 self._downloader.increment_downloads()
2768 file_id = url.split('/')[-1]
2769 # Rebuild url in english locale
2770 url = 'http://depositfiles.com/en/files/' + file_id
2772 # Retrieve file webpage with 'Free download' button pressed
2773 free_download_indication = { 'gateway_result' : '1' }
2774 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2776 self.report_download_webpage(file_id)
2777 webpage = urllib2.urlopen(request).read()
2778 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2779 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2782 # Search for the real file URL
2783 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2784 if (mobj is None) or (mobj.group(1) is None):
2785 # Try to figure out reason of the error.
2786 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2787 if (mobj is not None) and (mobj.group(1) is not None):
2788 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2789 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2791 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2794 file_url = mobj.group(1)
2795 file_extension = os.path.splitext(file_url)[1][1:]
2797 # Search for file title
2798 mobj = re.search(r'<b title="(.*?)">', webpage)
2800 self._downloader.trouble(u'ERROR: unable to extract title')
2802 file_title = mobj.group(1).decode('utf-8')
2805 # Process file information
2806 self._downloader.process_info({
2807 'id': file_id.decode('utf-8'),
2808 'url': file_url.decode('utf-8'),
2810 'upload_date': u'NA',
2811 'title': file_title,
2812 'stitle': file_title,
2813 'ext': file_extension.decode('utf-8'),
2817 except UnavailableVideoError, err:
2818 self._downloader.trouble(u'ERROR: unable to download file')
2821 class FacebookIE(InfoExtractor):
2822 """Information Extractor for Facebook"""
2824 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2825 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2826 _NETRC_MACHINE = 'facebook'
2827 _available_formats = ['video', 'highqual', 'lowqual']
2828 _video_extensions = {
2833 IE_NAME = u'facebook'
2835 def __init__(self, downloader=None):
2836 InfoExtractor.__init__(self, downloader)
2838 def _reporter(self, message):
2839 """Add header and report message."""
2840 self._downloader.to_screen(u'[facebook] %s' % message)
2842 def report_login(self):
2843 """Report attempt to log in."""
2844 self._reporter(u'Logging in')
2846 def report_video_webpage_download(self, video_id):
2847 """Report attempt to download video webpage."""
2848 self._reporter(u'%s: Downloading video webpage' % video_id)
2850 def report_information_extraction(self, video_id):
2851 """Report attempt to extract video information."""
2852 self._reporter(u'%s: Extracting video information' % video_id)
2854 def _parse_page(self, video_webpage):
2855 """Extract video information from page"""
2857 data = {'title': r'\("video_title", "(.*?)"\)',
2858 'description': r'<div class="datawrap">(.*?)</div>',
2859 'owner': r'\("video_owner_name", "(.*?)"\)',
2860 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2863 for piece in data.keys():
2864 mobj = re.search(data[piece], video_webpage)
2865 if mobj is not None:
2866 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2870 for fmt in self._available_formats:
2871 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2872 if mobj is not None:
2873 # URL is in a Javascript segment inside an escaped Unicode format within
2874 # the generally utf-8 page
2875 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2876 video_info['video_urls'] = video_urls
2880 def _real_initialize(self):
2881 if self._downloader is None:
2886 downloader_params = self._downloader.params
2888 # Attempt to use provided username and password or .netrc data
2889 if downloader_params.get('username', None) is not None:
2890 useremail = downloader_params['username']
2891 password = downloader_params['password']
2892 elif downloader_params.get('usenetrc', False):
2894 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2895 if info is not None:
2899 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2900 except (IOError, netrc.NetrcParseError), err:
2901 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2904 if useremail is None:
2913 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2916 login_results = urllib2.urlopen(request).read()
2917 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2918 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2920 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2924 def _real_extract(self, url):
2925 mobj = re.match(self._VALID_URL, url)
2927 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2929 video_id = mobj.group('ID')
2932 self.report_video_webpage_download(video_id)
2933 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2935 page = urllib2.urlopen(request)
2936 video_webpage = page.read()
2937 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2938 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2941 # Start extracting information
2942 self.report_information_extraction(video_id)
2944 # Extract information
2945 video_info = self._parse_page(video_webpage)
2948 if 'owner' not in video_info:
2949 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2951 video_uploader = video_info['owner']
2954 if 'title' not in video_info:
2955 self._downloader.trouble(u'ERROR: unable to extract video title')
2957 video_title = video_info['title']
2958 video_title = video_title.decode('utf-8')
2959 video_title = sanitize_title(video_title)
2961 simple_title = _simplify_title(video_title)
2964 if 'thumbnail' not in video_info:
2965 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2966 video_thumbnail = ''
2968 video_thumbnail = video_info['thumbnail']
2972 if 'upload_date' in video_info:
2973 upload_time = video_info['upload_date']
2974 timetuple = email.utils.parsedate_tz(upload_time)
2975 if timetuple is not None:
2977 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2982 video_description = video_info.get('description', 'No description available.')
2984 url_map = video_info['video_urls']
2985 if len(url_map.keys()) > 0:
2986 # Decide which formats to download
2987 req_format = self._downloader.params.get('format', None)
2988 format_limit = self._downloader.params.get('format_limit', None)
2990 if format_limit is not None and format_limit in self._available_formats:
2991 format_list = self._available_formats[self._available_formats.index(format_limit):]
2993 format_list = self._available_formats
2994 existing_formats = [x for x in format_list if x in url_map]
2995 if len(existing_formats) == 0:
2996 self._downloader.trouble(u'ERROR: no known formats available for video')
2998 if req_format is None:
2999 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3000 elif req_format == 'worst':
3001 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3002 elif req_format == '-1':
3003 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3006 if req_format not in url_map:
3007 self._downloader.trouble(u'ERROR: requested format not available')
3009 video_url_list = [(req_format, url_map[req_format])] # Specific format
3011 for format_param, video_real_url in video_url_list:
3013 # At this point we have a new video
3014 self._downloader.increment_downloads()
3017 video_extension = self._video_extensions.get(format_param, 'mp4')
3020 # Process video information
3021 self._downloader.process_info({
3022 'id': video_id.decode('utf-8'),
3023 'url': video_real_url.decode('utf-8'),
3024 'uploader': video_uploader.decode('utf-8'),
3025 'upload_date': upload_date,
3026 'title': video_title,
3027 'stitle': simple_title,
3028 'ext': video_extension.decode('utf-8'),
3029 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3030 'thumbnail': video_thumbnail.decode('utf-8'),
3031 'description': video_description.decode('utf-8'),
3034 except UnavailableVideoError, err:
3035 self._downloader.trouble(u'\nERROR: unable to download video')
3037 class BlipTVIE(InfoExtractor):
3038 """Information extractor for blip.tv"""
3040 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
3041 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3042 IE_NAME = u'blip.tv'
3044 def report_extraction(self, file_id):
3045 """Report information extraction."""
3046 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3048 def report_direct_download(self, title):
3049 """Report information extraction."""
3050 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3052 def _real_extract(self, url):
3053 mobj = re.match(self._VALID_URL, url)
3055 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3062 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3063 request = urllib2.Request(json_url)
3064 self.report_extraction(mobj.group(1))
3067 urlh = urllib2.urlopen(request)
3068 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3069 basename = url.split('/')[-1]
3070 title,ext = os.path.splitext(basename)
3071 title = title.decode('UTF-8')
3072 ext = ext.replace('.', '')
3073 self.report_direct_download(title)
3078 'stitle': _simplify_title(title),
3082 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3083 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3085 if info is None: # Regular URL
3087 json_code = urlh.read()
3088 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3089 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3093 json_data = json.loads(json_code)
3094 if 'Post' in json_data:
3095 data = json_data['Post']
3099 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3100 video_url = data['media']['url']
3101 umobj = re.match(self._URL_EXT, video_url)
3103 raise ValueError('Can not determine filename extension')
3104 ext = umobj.group(1)
3107 'id': data['item_id'],
3109 'uploader': data['display_name'],
3110 'upload_date': upload_date,
3111 'title': data['title'],
3112 'stitle': _simplify_title(data['title']),
3114 'format': data['media']['mimeType'],
3115 'thumbnail': data['thumbnailUrl'],
3116 'description': data['description'],
3117 'player_url': data['embedUrl']
3119 except (ValueError,KeyError), err:
3120 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3123 self._downloader.increment_downloads()
3126 self._downloader.process_info(info)
3127 except UnavailableVideoError, err:
3128 self._downloader.trouble(u'\nERROR: unable to download video')
3131 class MyVideoIE(InfoExtractor):
3132 """Information Extractor for myvideo.de."""
3134 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3135 IE_NAME = u'myvideo'
3137 def __init__(self, downloader=None):
3138 InfoExtractor.__init__(self, downloader)
3140 def report_download_webpage(self, video_id):
3141 """Report webpage download."""
3142 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3144 def report_extraction(self, video_id):
3145 """Report information extraction."""
3146 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3148 def _real_extract(self,url):
3149 mobj = re.match(self._VALID_URL, url)
3151 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3154 video_id = mobj.group(1)
3157 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3159 self.report_download_webpage(video_id)
3160 webpage = urllib2.urlopen(request).read()
3161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3165 self.report_extraction(video_id)
3166 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3169 self._downloader.trouble(u'ERROR: unable to extract media URL')
3171 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3173 mobj = re.search('<title>([^<]+)</title>', webpage)
3175 self._downloader.trouble(u'ERROR: unable to extract title')
3178 video_title = mobj.group(1)
3179 video_title = sanitize_title(video_title)
3181 simple_title = _simplify_title(video_title)
3184 self._downloader.process_info({
3188 'upload_date': u'NA',
3189 'title': video_title,
3190 'stitle': simple_title,
3195 except UnavailableVideoError:
3196 self._downloader.trouble(u'\nERROR: Unable to download video')
3198 class ComedyCentralIE(InfoExtractor):
3199 """Information extractor for The Daily Show and Colbert Report """
3201 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3202 IE_NAME = u'comedycentral'
3204 def report_extraction(self, episode_id):
3205 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3207 def report_config_download(self, episode_id):
3208 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3210 def report_index_download(self, episode_id):
3211 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3213 def report_player_url(self, episode_id):
3214 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3216 def _real_extract(self, url):
3217 mobj = re.match(self._VALID_URL, url)
3219 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3222 if mobj.group('shortname'):
3223 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3224 url = u'http://www.thedailyshow.com/full-episodes/'
3226 url = u'http://www.colbertnation.com/full-episodes/'
3227 mobj = re.match(self._VALID_URL, url)
3228 assert mobj is not None
3230 dlNewest = not mobj.group('episode')
3232 epTitle = mobj.group('showname')
3234 epTitle = mobj.group('episode')
3236 req = urllib2.Request(url)
3237 self.report_extraction(epTitle)
3239 htmlHandle = urllib2.urlopen(req)
3240 html = htmlHandle.read()
3241 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3242 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3245 url = htmlHandle.geturl()
3246 mobj = re.match(self._VALID_URL, url)
3248 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3250 if mobj.group('episode') == '':
3251 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3253 epTitle = mobj.group('episode')
3255 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3256 if len(mMovieParams) == 0:
3257 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3260 playerUrl_raw = mMovieParams[0][0]
3261 self.report_player_url(epTitle)
3263 urlHandle = urllib2.urlopen(playerUrl_raw)
3264 playerUrl = urlHandle.geturl()
3265 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3269 uri = mMovieParams[0][1]
3270 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3271 self.report_index_download(epTitle)
3273 indexXml = urllib2.urlopen(indexUrl).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3278 idoc = xml.etree.ElementTree.fromstring(indexXml)
3279 itemEls = idoc.findall('.//item')
3280 for itemEl in itemEls:
3281 mediaId = itemEl.findall('./guid')[0].text
3282 shortMediaId = mediaId.split(':')[-1]
3283 showId = mediaId.split(':')[-2].replace('.com', '')
3284 officialTitle = itemEl.findall('./title')[0].text
3285 officialDate = itemEl.findall('./pubDate')[0].text
3287 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3288 urllib.urlencode({'uri': mediaId}))
3289 configReq = urllib2.Request(configUrl)
3290 self.report_config_download(epTitle)
3292 configXml = urllib2.urlopen(configReq).read()
3293 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3294 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3297 cdoc = xml.etree.ElementTree.fromstring(configXml)
3299 for rendition in cdoc.findall('.//rendition'):
3300 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3304 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3307 # For now, just pick the highest bitrate
3308 format,video_url = turls[-1]
3310 self._downloader.increment_downloads()
3312 effTitle = showId + u'-' + epTitle
3317 'upload_date': officialDate,
3319 'stitle': _simplify_title(effTitle),
3323 'description': officialTitle,
3324 'player_url': playerUrl
3328 self._downloader.process_info(info)
3329 except UnavailableVideoError, err:
3330 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3334 class EscapistIE(InfoExtractor):
3335 """Information extractor for The Escapist """
3337 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3338 IE_NAME = u'escapist'
3340 def report_extraction(self, showName):
3341 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3343 def report_config_download(self, showName):
3344 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3346 def _real_extract(self, url):
3347 htmlParser = HTMLParser.HTMLParser()
3349 mobj = re.match(self._VALID_URL, url)
3351 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3353 showName = mobj.group('showname')
3354 videoId = mobj.group('episode')
3356 self.report_extraction(showName)
3358 webPage = urllib2.urlopen(url).read()
3359 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3360 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3363 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3364 description = htmlParser.unescape(descMatch.group(1))
3365 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3366 imgUrl = htmlParser.unescape(imgMatch.group(1))
3367 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3368 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3369 configUrlMatch = re.search('config=(.*)$', playerUrl)
3370 configUrl = urllib2.unquote(configUrlMatch.group(1))
3372 self.report_config_download(showName)
3374 configJSON = urllib2.urlopen(configUrl).read()
3375 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3376 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3379 # Technically, it's JavaScript, not JSON
3380 configJSON = configJSON.replace("'", '"')
3383 config = json.loads(configJSON)
3384 except (ValueError,), err:
3385 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3388 playlist = config['playlist']
3389 videoUrl = playlist[1]['url']
3391 self._downloader.increment_downloads()
3395 'uploader': showName,
3396 'upload_date': None,
3398 'stitle': _simplify_title(showName),
3401 'thumbnail': imgUrl,
3402 'description': description,
3403 'player_url': playerUrl,
3407 self._downloader.process_info(info)
3408 except UnavailableVideoError, err:
3409 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3412 class CollegeHumorIE(InfoExtractor):
3413 """Information extractor for collegehumor.com"""
3415 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3416 IE_NAME = u'collegehumor'
3418 def report_webpage(self, video_id):
3419 """Report information extraction."""
3420 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3422 def report_extraction(self, video_id):
3423 """Report information extraction."""
3424 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3426 def _real_extract(self, url):
3427 htmlParser = HTMLParser.HTMLParser()
3429 mobj = re.match(self._VALID_URL, url)
3431 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3433 video_id = mobj.group('videoid')
3435 self.report_webpage(video_id)
3436 request = urllib2.Request(url)
3438 webpage = urllib2.urlopen(request).read()
3439 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3440 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3443 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3445 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3447 internal_video_id = m.group('internalvideoid')
3451 'internal_id': internal_video_id,
3454 self.report_extraction(video_id)
3455 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3457 metaXml = urllib2.urlopen(xmlUrl).read()
3458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3459 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3462 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3464 videoNode = mdoc.findall('./video')[0]
3465 info['description'] = videoNode.findall('./description')[0].text
3466 info['title'] = videoNode.findall('./caption')[0].text
3467 info['stitle'] = _simplify_title(info['title'])
3468 info['url'] = videoNode.findall('./file')[0].text
3469 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3470 info['ext'] = info['url'].rpartition('.')[2]
3471 info['format'] = info['ext']
3473 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3476 self._downloader.increment_downloads()
3479 self._downloader.process_info(info)
3480 except UnavailableVideoError, err:
3481 self._downloader.trouble(u'\nERROR: unable to download video')
3484 class XVideosIE(InfoExtractor):
3485 """Information extractor for xvideos.com"""
3487 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3488 IE_NAME = u'xvideos'
3490 def report_webpage(self, video_id):
3491 """Report information extraction."""
3492 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3494 def report_extraction(self, video_id):
3495 """Report information extraction."""
3496 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3498 def _real_extract(self, url):
3499 htmlParser = HTMLParser.HTMLParser()
3501 mobj = re.match(self._VALID_URL, url)
3503 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3505 video_id = mobj.group(1).decode('utf-8')
3507 self.report_webpage(video_id)
3509 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3511 webpage = urllib2.urlopen(request).read()
3512 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3513 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3516 self.report_extraction(video_id)
3520 mobj = re.search(r'flv_url=(.+?)&', webpage)
3522 self._downloader.trouble(u'ERROR: unable to extract video url')
3524 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3528 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3530 self._downloader.trouble(u'ERROR: unable to extract video title')
3532 video_title = mobj.group(1).decode('utf-8')
3535 # Extract video thumbnail
3536 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3538 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3540 video_thumbnail = mobj.group(1).decode('utf-8')
3544 self._downloader.increment_downloads()
3549 'upload_date': None,
3550 'title': video_title,
3551 'stitle': _simplify_title(video_title),
3554 'thumbnail': video_thumbnail,
3555 'description': None,
3560 self._downloader.process_info(info)
3561 except UnavailableVideoError, err:
3562 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3565 class SoundcloudIE(InfoExtractor):
3566 """Information extractor for soundcloud.com
3567 To access the media, the uid of the song and a stream token
3568 must be extracted from the page source and the script must make
3569 a request to media.soundcloud.com/crossdomain.xml. Then
3570 the media can be grabbed by requesting from an url composed
3571 of the stream token and uid
3574 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3575 IE_NAME = u'soundcloud'
3577 def __init__(self, downloader=None):
3578 InfoExtractor.__init__(self, downloader)
3580 def report_webpage(self, video_id):
3581 """Report information extraction."""
3582 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3584 def report_extraction(self, video_id):
3585 """Report information extraction."""
3586 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3588 def _real_extract(self, url):
3589 htmlParser = HTMLParser.HTMLParser()
3591 mobj = re.match(self._VALID_URL, url)
3593 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3596 # extract uploader (which is in the url)
3597 uploader = mobj.group(1).decode('utf-8')
3598 # extract simple title (uploader + slug of song title)
3599 slug_title = mobj.group(2).decode('utf-8')
3600 simple_title = uploader + '-' + slug_title
3602 self.report_webpage('%s/%s' % (uploader, slug_title))
3604 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3606 webpage = urllib2.urlopen(request).read()
3607 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3608 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3611 self.report_extraction('%s/%s' % (uploader, slug_title))
3613 # extract uid and stream token that soundcloud hands out for access
3614 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3616 video_id = mobj.group(1)
3617 stream_token = mobj.group(2)
3619 # extract unsimplified title
3620 mobj = re.search('"title":"(.*?)",', webpage)
3622 title = mobj.group(1)
3624 # construct media url (with uid/token)
3625 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3626 mediaURL = mediaURL % (video_id, stream_token)
3629 description = u'No description available'
3630 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3632 description = mobj.group(1)
3636 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3639 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3640 except Exception, e:
3643 # for soundcloud, a request to a cross domain is required for cookies
3644 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3647 self._downloader.process_info({
3648 'id': video_id.decode('utf-8'),
3650 'uploader': uploader.decode('utf-8'),
3651 'upload_date': upload_date,
3652 'title': simple_title.decode('utf-8'),
3653 'stitle': simple_title.decode('utf-8'),
3657 'description': description.decode('utf-8')
3659 except UnavailableVideoError:
3660 self._downloader.trouble(u'\nERROR: unable to download video')
3663 class InfoQIE(InfoExtractor):
3664 """Information extractor for infoq.com"""
3666 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3669 def report_webpage(self, video_id):
3670 """Report information extraction."""
3671 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3673 def report_extraction(self, video_id):
3674 """Report information extraction."""
3675 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3677 def _real_extract(self, url):
3678 htmlParser = HTMLParser.HTMLParser()
3680 mobj = re.match(self._VALID_URL, url)
3682 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3685 self.report_webpage(url)
3687 request = urllib2.Request(url)
3689 webpage = urllib2.urlopen(request).read()
3690 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3691 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3694 self.report_extraction(url)
3698 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3700 self._downloader.trouble(u'ERROR: unable to extract video url')
3702 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3706 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3708 self._downloader.trouble(u'ERROR: unable to extract video title')
3710 video_title = mobj.group(1).decode('utf-8')
3712 # Extract description
3713 video_description = u'No description available.'
3714 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3715 if mobj is not None:
3716 video_description = mobj.group(1).decode('utf-8')
3718 video_filename = video_url.split('/')[-1]
3719 video_id, extension = video_filename.split('.')
3721 self._downloader.increment_downloads()
3726 'upload_date': None,
3727 'title': video_title,
3728 'stitle': _simplify_title(video_title),
3730 'format': extension, # Extension is always(?) mp4, but seems to be flv
3732 'description': video_description,
3737 self._downloader.process_info(info)
3738 except UnavailableVideoError, err:
3739 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3741 class MixcloudIE(InfoExtractor):
3742 """Information extractor for www.mixcloud.com"""
3743 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3744 IE_NAME = u'mixcloud'
3746 def __init__(self, downloader=None):
3747 InfoExtractor.__init__(self, downloader)
3749 def report_download_json(self, file_id):
3750 """Report JSON download."""
3751 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3753 def report_extraction(self, file_id):
3754 """Report information extraction."""
3755 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3757 def get_urls(self, jsonData, fmt, bitrate='best'):
3758 """Get urls from 'audio_formats' section in json"""
3761 bitrate_list = jsonData[fmt]
3762 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3763 bitrate = max(bitrate_list) # select highest
3765 url_list = jsonData[fmt][bitrate]
3766 except TypeError: # we have no bitrate info.
3767 url_list = jsonData[fmt]
3771 def check_urls(self, url_list):
3772 """Returns 1st active url from list"""
3773 for url in url_list:
3775 urllib2.urlopen(url)
3777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3782 def _print_formats(self, formats):
3783 print 'Available formats:'
3784 for fmt in formats.keys():
3785 for b in formats[fmt]:
3787 ext = formats[fmt][b][0]
3788 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3789 except TypeError: # we have no bitrate info
3790 ext = formats[fmt][0]
3791 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3794 def _real_extract(self, url):
3795 mobj = re.match(self._VALID_URL, url)
3797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3799 # extract uploader & filename from url
3800 uploader = mobj.group(1).decode('utf-8')
3801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3803 # construct API request
3804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3805 # retrieve .json file with links to files
3806 request = urllib2.Request(file_url)
3808 self.report_download_json(file_url)
3809 jsonData = urllib2.urlopen(request).read()
3810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3815 json_data = json.loads(jsonData)
3816 player_url = json_data['player_swf_url']
3817 formats = dict(json_data['audio_formats'])
3819 req_format = self._downloader.params.get('format', None)
3822 if self._downloader.params.get('listformats', None):
3823 self._print_formats(formats)
3826 if req_format is None or req_format == 'best':
3827 for format_param in formats.keys():
3828 url_list = self.get_urls(formats, format_param)
3830 file_url = self.check_urls(url_list)
3831 if file_url is not None:
3834 if req_format not in formats.keys():
3835 self._downloader.trouble(u'ERROR: format is not available')
3838 url_list = self.get_urls(formats, req_format)
3839 file_url = self.check_urls(url_list)
3840 format_param = req_format
3843 self._downloader.increment_downloads()
3845 # Process file information
3846 self._downloader.process_info({
3847 'id': file_id.decode('utf-8'),
3848 'url': file_url.decode('utf-8'),
3849 'uploader': uploader.decode('utf-8'),
3850 'upload_date': u'NA',
3851 'title': json_data['name'],
3852 'stitle': _simplify_title(json_data['name']),
3853 'ext': file_url.split('.')[-1].decode('utf-8'),
3854 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3855 'thumbnail': json_data['thumbnail_url'],
3856 'description': json_data['description'],
3857 'player_url': player_url.decode('utf-8'),
3859 except UnavailableVideoError, err:
3860 self._downloader.trouble(u'ERROR: unable to download file')
3862 class StanfordOpenClassroomIE(InfoExtractor):
3863 """Information extractor for Stanford's Open ClassRoom"""
3865 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3866 IE_NAME = u'stanfordoc'
3868 def report_download_webpage(self, objid):
3869 """Report information extraction."""
3870 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3872 def report_extraction(self, video_id):
3873 """Report information extraction."""
3874 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3876 def _real_extract(self, url):
3877 mobj = re.match(self._VALID_URL, url)
3879 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3882 if mobj.group('course') and mobj.group('video'): # A specific video
3883 course = mobj.group('course')
3884 video = mobj.group('video')
3886 'id': _simplify_title(course + '_' + video),
3889 self.report_extraction(info['id'])
3890 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3891 xmlUrl = baseUrl + video + '.xml'
3893 metaXml = urllib2.urlopen(xmlUrl).read()
3894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3895 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3897 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3899 info['title'] = mdoc.findall('./title')[0].text
3900 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3902 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3904 info['stitle'] = _simplify_title(info['title'])
3905 info['ext'] = info['url'].rpartition('.')[2]
3906 info['format'] = info['ext']
3907 self._downloader.increment_downloads()
3909 self._downloader.process_info(info)
3910 except UnavailableVideoError, err:
3911 self._downloader.trouble(u'\nERROR: unable to download video')
3912 elif mobj.group('course'): # A course page
3913 unescapeHTML = HTMLParser.HTMLParser().unescape
3915 course = mobj.group('course')
3917 'id': _simplify_title(course),
3921 self.report_download_webpage(info['id'])
3923 coursepage = urllib2.urlopen(url).read()
3924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3925 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3928 m = re.search('<h1>([^<]+)</h1>', coursepage)
3930 info['title'] = unescapeHTML(m.group(1))
3932 info['title'] = info['id']
3933 info['stitle'] = _simplify_title(info['title'])
3935 m = re.search('<description>([^<]+)</description>', coursepage)
3937 info['description'] = unescapeHTML(m.group(1))
3939 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3942 'type': 'reference',
3943 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3947 for entry in info['list']:
3948 assert entry['type'] == 'reference'
3949 self.extract(entry['url'])
3951 unescapeHTML = HTMLParser.HTMLParser().unescape
3954 'id': 'Stanford OpenClassroom',
3958 self.report_download_webpage(info['id'])
3959 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3961 rootpage = urllib2.urlopen(rootURL).read()
3962 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3963 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3966 info['title'] = info['id']
3967 info['stitle'] = _simplify_title(info['title'])
3969 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3972 'type': 'reference',
3973 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3977 for entry in info['list']:
3978 assert entry['type'] == 'reference'
3979 self.extract(entry['url'])
3981 class MTVIE(InfoExtractor):
3982 """Information extractor for MTV.com"""
3984 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3987 def report_webpage(self, video_id):
3988 """Report information extraction."""
3989 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3991 def report_extraction(self, video_id):
3992 """Report information extraction."""
3993 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3995 def _real_extract(self, url):
3996 mobj = re.match(self._VALID_URL, url)
3998 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4000 if not mobj.group('proto'):
4001 url = 'http://' + url
4002 video_id = mobj.group('videoid')
4003 self.report_webpage(video_id)
4005 request = urllib2.Request(url)
4007 webpage = urllib2.urlopen(request).read()
4008 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4009 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4012 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4014 self._downloader.trouble(u'ERROR: unable to extract song name')
4016 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4017 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4019 self._downloader.trouble(u'ERROR: unable to extract performer')
4021 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4022 video_title = performer + ' - ' + song_name
4024 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4026 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4028 mtvn_uri = mobj.group(1)
4030 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4032 self._downloader.trouble(u'ERROR: unable to extract content id')
4034 content_id = mobj.group(1)
4036 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4037 self.report_extraction(video_id)
4038 request = urllib2.Request(videogen_url)
4040 metadataXml = urllib2.urlopen(request).read()
4041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4042 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4045 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4046 renditions = mdoc.findall('.//rendition')
4048 # For now, always pick the highest quality.
4049 rendition = renditions[-1]
4052 _,_,ext = rendition.attrib['type'].partition('/')
4053 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4054 video_url = rendition.find('./src').text
4056 self._downloader.trouble('Invalid rendition field.')
4059 self._downloader.increment_downloads()
4063 'uploader': performer,
4064 'title': video_title,
4065 'stitle': _simplify_title(video_title),
4071 self._downloader.process_info(info)
4072 except UnavailableVideoError, err:
4073 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4076 class PostProcessor(object):
4077 """Post Processor class.
4079 PostProcessor objects can be added to downloaders with their
4080 add_post_processor() method. When the downloader has finished a
4081 successful download, it will take its internal chain of PostProcessors
4082 and start calling the run() method on each one of them, first with
4083 an initial argument and then with the returned value of the previous
4086 The chain will be stopped if one of them ever returns None or the end
4087 of the chain is reached.
4089 PostProcessor objects follow a "mutual registration" process similar
4090 to InfoExtractor objects.
4095 def __init__(self, downloader=None):
4096 self._downloader = downloader
4098 def set_downloader(self, downloader):
4099 """Sets the downloader for this PP."""
4100 self._downloader = downloader
4102 def run(self, information):
4103 """Run the PostProcessor.
4105 The "information" argument is a dictionary like the ones
4106 composed by InfoExtractors. The only difference is that this
4107 one has an extra field called "filepath" that points to the
4110 When this method returns None, the postprocessing chain is
4111 stopped. However, this method may return an information
4112 dictionary that will be passed to the next postprocessing
4113 object in the chain. It can be the one it received after
4114 changing some fields.
4116 In addition, this method may raise a PostProcessingError
4117 exception that will be taken into account by the downloader
4120 return information # by default, do nothing
4122 class AudioConversionError(BaseException):
4123 def __init__(self, message):
4124 self.message = message
4126 class FFmpegExtractAudioPP(PostProcessor):
4128 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4129 PostProcessor.__init__(self, downloader)
4130 if preferredcodec is None:
4131 preferredcodec = 'best'
4132 self._preferredcodec = preferredcodec
4133 self._preferredquality = preferredquality
4134 self._keepvideo = keepvideo
4137 def get_audio_codec(path):
4139 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4140 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4141 output = handle.communicate()[0]
4142 if handle.wait() != 0:
4144 except (IOError, OSError):
4147 for line in output.split('\n'):
4148 if line.startswith('codec_name='):
4149 audio_codec = line.split('=')[1].strip()
4150 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
4155 def run_ffmpeg(path, out_path, codec, more_opts):
4159 acodec_opts = ['-acodec', codec]
4160 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4162 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4163 stdout,stderr = p.communicate()
4164 except (IOError, OSError):
4165 e = sys.exc_info()[1]
4166 if isinstance(e, OSError) and e.errno == 2:
4167 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4170 if p.returncode != 0:
4171 msg = stderr.strip().split('\n')[-1]
4172 raise AudioConversionError(msg)
4174 def run(self, information):
4175 path = information['filepath']
4177 filecodec = self.get_audio_codec(path)
4178 if filecodec is None:
4179 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4183 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4184 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4185 # Lossless, but in another container
4187 extension = self._preferredcodec
4188 more_opts = ['-absf', 'aac_adtstoasc']
4189 elif filecodec in ['aac', 'mp3', 'vorbis']:
4190 # Lossless if possible
4192 extension = filecodec
4193 if filecodec == 'aac':
4194 more_opts = ['-f', 'adts']
4195 if filecodec == 'vorbis':
4199 acodec = 'libmp3lame'
4202 if self._preferredquality is not None:
4203 more_opts += ['-ab', self._preferredquality]
4205 # We convert the audio (lossy)
4206 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4207 extension = self._preferredcodec
4209 if self._preferredquality is not None:
4210 more_opts += ['-ab', self._preferredquality]
4211 if self._preferredcodec == 'aac':
4212 more_opts += ['-f', 'adts']
4213 if self._preferredcodec == 'm4a':
4214 more_opts += ['-absf', 'aac_adtstoasc']
4215 if self._preferredcodec == 'vorbis':
4217 if self._preferredcodec == 'wav':
4219 more_opts += ['-f', 'wav']
4221 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4222 new_path = prefix + sep + extension
4223 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4225 self.run_ffmpeg(path, new_path, acodec, more_opts)
4227 etype,e,tb = sys.exc_info()
4228 if isinstance(e, AudioConversionError):
4229 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4231 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4234 # Try to update the date time for extracted audio file.
4235 if information.get('filetime') is not None:
4237 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4239 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4241 if not self._keepvideo:
4243 os.remove(_encodeFilename(path))
4244 except (IOError, OSError):
4245 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4248 information['filepath'] = new_path
4252 def updateSelf(downloader, filename):
4253 ''' Update the program file with the latest version from the repository '''
4254 # Note: downloader only used for options
4255 if not os.access(filename, os.W_OK):
4256 sys.exit('ERROR: no write permissions on %s' % filename)
4258 downloader.to_screen(u'Updating to latest version...')
4262 urlh = urllib.urlopen(UPDATE_URL)
4263 newcontent = urlh.read()
4265 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4266 if vmatch is not None and vmatch.group(1) == __version__:
4267 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4271 except (IOError, OSError), err:
4272 sys.exit('ERROR: unable to download latest version')
4275 outf = open(filename, 'wb')
4277 outf.write(newcontent)
4280 except (IOError, OSError), err:
4281 sys.exit('ERROR: unable to overwrite current version')
4283 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4286 def _readOptions(filename_bytes):
4288 optionf = open(filename_bytes)
4290 return [] # silently skip if file is not present
4294 res += shlex.split(l, comments=True)
4299 def _format_option_string(option):
4300 ''' ('-o', '--option') -> -o, --format METAVAR'''
4304 if option._short_opts: opts.append(option._short_opts[0])
4305 if option._long_opts: opts.append(option._long_opts[0])
4306 if len(opts) > 1: opts.insert(1, ', ')
4308 if option.takes_value(): opts.append(' %s' % option.metavar)
4310 return "".join(opts)
4312 def _find_term_columns():
4313 columns = os.environ.get('COLUMNS', None)
4318 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4319 out,err = sp.communicate()
4320 return int(out.split()[1])
4326 max_help_position = 80
4328 # No need to wrap help messages if we're on a wide console
4329 columns = _find_term_columns()
4330 if columns: max_width = columns
4332 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4333 fmt.format_option_strings = _format_option_string
4336 'version' : __version__,
4338 'usage' : '%prog [options] url [url...]',
4339 'conflict_handler' : 'resolve',
4342 parser = optparse.OptionParser(**kw)
4345 general = optparse.OptionGroup(parser, 'General Options')
4346 selection = optparse.OptionGroup(parser, 'Video Selection')
4347 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4348 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4349 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4350 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4351 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4353 general.add_option('-h', '--help',
4354 action='help', help='print this help text and exit')
4355 general.add_option('-v', '--version',
4356 action='version', help='print program version and exit')
4357 general.add_option('-U', '--update',
4358 action='store_true', dest='update_self', help='update this program to latest version')
4359 general.add_option('-i', '--ignore-errors',
4360 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4361 general.add_option('-r', '--rate-limit',
4362 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4363 general.add_option('-R', '--retries',
4364 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4365 general.add_option('--dump-user-agent',
4366 action='store_true', dest='dump_user_agent',
4367 help='display the current browser identification', default=False)
4368 general.add_option('--list-extractors',
4369 action='store_true', dest='list_extractors',
4370 help='List all supported extractors and the URLs they would handle', default=False)
4372 selection.add_option('--playlist-start',
4373 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4374 selection.add_option('--playlist-end',
4375 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4376 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4377 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4378 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4380 authentication.add_option('-u', '--username',
4381 dest='username', metavar='USERNAME', help='account username')
4382 authentication.add_option('-p', '--password',
4383 dest='password', metavar='PASSWORD', help='account password')
4384 authentication.add_option('-n', '--netrc',
4385 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4388 video_format.add_option('-f', '--format',
4389 action='store', dest='format', metavar='FORMAT', help='video format code')
4390 video_format.add_option('--all-formats',
4391 action='store_const', dest='format', help='download all available video formats', const='all')
4392 video_format.add_option('--prefer-free-formats',
4393 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4394 video_format.add_option('--max-quality',
4395 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4396 video_format.add_option('-F', '--list-formats',
4397 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4398 video_format.add_option('--write-srt',
4399 action='store_true', dest='writesubtitles',
4400 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4401 video_format.add_option('--srt-lang',
4402 action='store', dest='subtitleslang', metavar='LANG',
4403 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4406 verbosity.add_option('-q', '--quiet',
4407 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4408 verbosity.add_option('-s', '--simulate',
4409 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4410 verbosity.add_option('--skip-download',
4411 action='store_true', dest='skip_download', help='do not download the video', default=False)
4412 verbosity.add_option('-g', '--get-url',
4413 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4414 verbosity.add_option('-e', '--get-title',
4415 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4416 verbosity.add_option('--get-thumbnail',
4417 action='store_true', dest='getthumbnail',
4418 help='simulate, quiet but print thumbnail URL', default=False)
4419 verbosity.add_option('--get-description',
4420 action='store_true', dest='getdescription',
4421 help='simulate, quiet but print video description', default=False)
4422 verbosity.add_option('--get-filename',
4423 action='store_true', dest='getfilename',
4424 help='simulate, quiet but print output filename', default=False)
4425 verbosity.add_option('--get-format',
4426 action='store_true', dest='getformat',
4427 help='simulate, quiet but print output format', default=False)
4428 verbosity.add_option('--no-progress',
4429 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4430 verbosity.add_option('--console-title',
4431 action='store_true', dest='consoletitle',
4432 help='display progress in console titlebar', default=False)
4433 verbosity.add_option('-v', '--verbose',
4434 action='store_true', dest='verbose', help='print various debugging information', default=False)
4437 filesystem.add_option('-t', '--title',
4438 action='store_true', dest='usetitle', help='use title in file name', default=False)
4439 filesystem.add_option('-l', '--literal',
4440 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4441 filesystem.add_option('-A', '--auto-number',
4442 action='store_true', dest='autonumber',
4443 help='number downloaded files starting from 00000', default=False)
4444 filesystem.add_option('-o', '--output',
4445 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4446 filesystem.add_option('-a', '--batch-file',
4447 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4448 filesystem.add_option('-w', '--no-overwrites',
4449 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4450 filesystem.add_option('-c', '--continue',
4451 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4452 filesystem.add_option('--no-continue',
4453 action='store_false', dest='continue_dl',
4454 help='do not resume partially downloaded files (restart from beginning)')
4455 filesystem.add_option('--cookies',
4456 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4457 filesystem.add_option('--no-part',
4458 action='store_true', dest='nopart', help='do not use .part files', default=False)
4459 filesystem.add_option('--no-mtime',
4460 action='store_false', dest='updatetime',
4461 help='do not use the Last-modified header to set the file modification time', default=True)
4462 filesystem.add_option('--write-description',
4463 action='store_true', dest='writedescription',
4464 help='write video description to a .description file', default=False)
4465 filesystem.add_option('--write-info-json',
4466 action='store_true', dest='writeinfojson',
4467 help='write video metadata to a .info.json file', default=False)
4470 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4471 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4472 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4473 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4474 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4475 help='ffmpeg audio bitrate specification, 128k by default')
4476 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4477 help='keeps the video file on disk after the post-processing; the video is erased by default')
4480 parser.add_option_group(general)
4481 parser.add_option_group(selection)
4482 parser.add_option_group(filesystem)
4483 parser.add_option_group(verbosity)
4484 parser.add_option_group(video_format)
4485 parser.add_option_group(authentication)
4486 parser.add_option_group(postproc)
4488 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4490 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4492 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4493 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4494 opts, args = parser.parse_args(argv)
4496 return parser, opts, args
4498 def gen_extractors():
4499 """ Return a list of an instance of every supported extractor.
4500 The order does matter; the first extractor matched is the one handling the URL.
4502 youtube_ie = YoutubeIE()
4503 google_ie = GoogleIE()
4504 yahoo_ie = YahooIE()
4506 YoutubePlaylistIE(youtube_ie),
4507 YoutubeUserIE(youtube_ie),
4508 YoutubeSearchIE(youtube_ie),
4510 MetacafeIE(youtube_ie),
4513 GoogleSearchIE(google_ie),
4516 YahooSearchIE(yahoo_ie),
4529 StanfordOpenClassroomIE(),
4536 parser, opts, args = parseOpts()
4538 # Open appropriate CookieJar
4539 if opts.cookiefile is None:
4540 jar = cookielib.CookieJar()
4543 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4544 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4546 except (IOError, OSError), err:
4547 sys.exit(u'ERROR: unable to open cookie file')
4550 if opts.dump_user_agent:
4551 print std_headers['User-Agent']
4554 # Batch file verification
4556 if opts.batchfile is not None:
4558 if opts.batchfile == '-':
4561 batchfd = open(opts.batchfile, 'r')
4562 batchurls = batchfd.readlines()
4563 batchurls = [x.strip() for x in batchurls]
4564 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4566 sys.exit(u'ERROR: batch file could not be read')
4567 all_urls = batchurls + args
4569 # General configuration
4570 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4571 proxy_handler = urllib2.ProxyHandler()
4572 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4573 urllib2.install_opener(opener)
4574 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4577 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4579 extractors = gen_extractors()
4581 if opts.list_extractors:
4582 for ie in extractors:
4584 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4585 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4586 for mu in matchedUrls:
4590 # Conflicting, missing and erroneous options
4591 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4592 parser.error(u'using .netrc conflicts with giving username/password')
4593 if opts.password is not None and opts.username is None:
4594 parser.error(u'account username missing')
4595 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4596 parser.error(u'using output template conflicts with using title, literal title or auto number')
4597 if opts.usetitle and opts.useliteral:
4598 parser.error(u'using title conflicts with using literal title')
4599 if opts.username is not None and opts.password is None:
4600 opts.password = getpass.getpass(u'Type account password and press return:')
4601 if opts.ratelimit is not None:
4602 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4603 if numeric_limit is None:
4604 parser.error(u'invalid rate limit specified')
4605 opts.ratelimit = numeric_limit
4606 if opts.retries is not None:
4608 opts.retries = long(opts.retries)
4609 except (TypeError, ValueError), err:
4610 parser.error(u'invalid retry count specified')
4612 opts.playliststart = int(opts.playliststart)
4613 if opts.playliststart <= 0:
4614 raise ValueError(u'Playlist start must be positive')
4615 except (TypeError, ValueError), err:
4616 parser.error(u'invalid playlist start number specified')
4618 opts.playlistend = int(opts.playlistend)
4619 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4620 raise ValueError(u'Playlist end must be greater than playlist start')
4621 except (TypeError, ValueError), err:
4622 parser.error(u'invalid playlist end number specified')
4623 if opts.extractaudio:
4624 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4625 parser.error(u'invalid audio format specified')
4628 fd = FileDownloader({
4629 'usenetrc': opts.usenetrc,
4630 'username': opts.username,
4631 'password': opts.password,
4632 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4633 'forceurl': opts.geturl,
4634 'forcetitle': opts.gettitle,
4635 'forcethumbnail': opts.getthumbnail,
4636 'forcedescription': opts.getdescription,
4637 'forcefilename': opts.getfilename,
4638 'forceformat': opts.getformat,
4639 'simulate': opts.simulate,
4640 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4641 'format': opts.format,
4642 'format_limit': opts.format_limit,
4643 'listformats': opts.listformats,
4644 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4645 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4646 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4647 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4648 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4649 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4650 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4651 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4652 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4653 or u'%(id)s.%(ext)s'),
4654 'ignoreerrors': opts.ignoreerrors,
4655 'ratelimit': opts.ratelimit,
4656 'nooverwrites': opts.nooverwrites,
4657 'retries': opts.retries,
4658 'continuedl': opts.continue_dl,
4659 'noprogress': opts.noprogress,
4660 'playliststart': opts.playliststart,
4661 'playlistend': opts.playlistend,
4662 'logtostderr': opts.outtmpl == '-',
4663 'consoletitle': opts.consoletitle,
4664 'nopart': opts.nopart,
4665 'updatetime': opts.updatetime,
4666 'writedescription': opts.writedescription,
4667 'writeinfojson': opts.writeinfojson,
4668 'writesubtitles': opts.writesubtitles,
4669 'subtitleslang': opts.subtitleslang,
4670 'matchtitle': opts.matchtitle,
4671 'rejecttitle': opts.rejecttitle,
4672 'max_downloads': opts.max_downloads,
4673 'prefer_free_formats': opts.prefer_free_formats,
4674 'verbose': opts.verbose,
4676 for extractor in extractors:
4677 fd.add_info_extractor(extractor)
4680 if opts.extractaudio:
4681 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4684 if opts.update_self:
4685 updateSelf(fd, sys.argv[0])
4688 if len(all_urls) < 1:
4689 if not opts.update_self:
4690 parser.error(u'you must provide at least one URL')
4695 retcode = fd.download(all_urls)
4696 except MaxDownloadsReached:
4697 fd.to_screen(u'--max-download limit reached, aborting.')
4700 # Dump cookie jar if requested
4701 if opts.cookiefile is not None:
4704 except (IOError, OSError), err:
4705 sys.exit(u'ERROR: unable to save cookie jar')
4712 except DownloadError:
4714 except SameFileError:
4715 sys.exit(u'ERROR: fixed output name but more than one file to download')
4716 except KeyboardInterrupt:
4717 sys.exit(u'\nERROR: Interrupted by user')
4719 if __name__ == '__main__':
4722 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: