2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
def raiseError(msg, i):
    """Abort parsing: raise ValueError pinpointing position *i* in the
    input string `s` (taken from the enclosing scope of the JSON parser)."""
    raise ValueError('%s at position %d of %r: %r' % (msg, i, s, s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # Both values are byte counts, kept on the instance so callers can
    # report how much actually arrived versus what the server announced.
    self.expected = expected
    self.downloaded = downloaded
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 writesubtitles: Write the video subtitles to a .srt file
494 subtitleslang: Language of the subtitles to download
500 _download_retcode = None
501 _num_downloads = None
504 def __init__(self, params):
505 """Create a FileDownloader object with the given options."""
508 self._download_retcode = 0
509 self._num_downloads = 0
510 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
514 def format_bytes(bytes):
517 if type(bytes) is str:
522 exponent = long(math.log(bytes, 1024.0))
523 suffix = 'bkMGTPEZY'[exponent]
524 converted = float(bytes) / float(1024 ** exponent)
525 return '%.2f%s' % (converted, suffix)
528 def calc_percent(byte_counter, data_len):
531 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
534 def calc_eta(start, now, total, current):
538 if current == 0 or dif < 0.001: # One millisecond
540 rate = float(current) / dif
541 eta = long((float(total) - float(current)) / rate)
542 (eta_mins, eta_secs) = divmod(eta, 60)
545 return '%02d:%02d' % (eta_mins, eta_secs)
548 def calc_speed(start, now, bytes):
550 if bytes == 0 or dif < 0.001: # One millisecond
551 return '%10s' % '---b/s'
552 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
555 def best_block_size(elapsed_time, bytes):
556 new_min = max(bytes / 2.0, 1.0)
557 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
558 if elapsed_time < 0.001:
560 rate = bytes / elapsed_time
568 def parse_bytes(bytestr):
569 """Parse a string indicating a byte quantity into a long integer."""
570 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
573 number = float(matchobj.group(1))
574 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
575 return long(round(number * multiplier))
577 def add_info_extractor(self, ie):
578 """Add an InfoExtractor object to the end of the list."""
580 ie.set_downloader(self)
582 def add_post_processor(self, pp):
583 """Add a PostProcessor object to the end of the chain."""
585 pp.set_downloader(self)
587 def to_screen(self, message, skip_eol=False):
588 """Print message to stdout if not in quiet mode."""
589 assert type(message) == type(u'')
590 if not self.params.get('quiet', False):
591 terminator = [u'\n', u''][skip_eol]
592 output = message + terminator
594 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
595 output = output.encode(preferredencoding(), 'ignore')
596 self._screen_file.write(output)
597 self._screen_file.flush()
def to_stderr(self, message):
    """Write *message* (unicode) to stderr in the preferred encoding,
    followed by a newline."""
    encoded = message.encode(preferredencoding())
    sys.stderr.write(encoded + '\n')
603 def to_cons_title(self, message):
604 """Set console/terminal window title to message."""
605 if not self.params.get('consoletitle', False):
607 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
608 # c_wchar_p() might not be necessary if `message` is
609 # already of type unicode()
610 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
611 elif 'TERM' in os.environ:
612 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614 def fixed_template(self):
615 """Checks if the output template is fixed."""
616 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618 def trouble(self, message=None):
619 """Determine action to take when a download problem appears.
621 Depending on if the downloader has been configured to ignore
622 download errors or not, this method may throw an exception or
623 not when errors are found, after printing the message.
625 if message is not None:
626 self.to_stderr(message)
627 if not self.params.get('ignoreerrors', False):
628 raise DownloadError(message)
629 self._download_retcode = 1
631 def slow_down(self, start_time, byte_counter):
632 """Sleep if the download speed is over the rate limit."""
633 rate_limit = self.params.get('ratelimit', None)
634 if rate_limit is None or byte_counter == 0:
637 elapsed = now - start_time
640 speed = float(byte_counter) / elapsed
641 if speed > rate_limit:
642 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644 def temp_name(self, filename):
645 """Returns a temporary filename for the given filename."""
646 if self.params.get('nopart', False) or filename == u'-' or \
647 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
649 return filename + u'.part'
651 def undo_temp_name(self, filename):
652 if filename.endswith(u'.part'):
653 return filename[:-len(u'.part')]
656 def try_rename(self, old_filename, new_filename):
658 if old_filename == new_filename:
660 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
661 except (IOError, OSError), err:
662 self.trouble(u'ERROR: unable to rename file')
664 def try_utime(self, filename, last_modified_hdr):
665 """Try to set the last-modified time of the given file."""
666 if last_modified_hdr is None:
668 if not os.path.isfile(_encodeFilename(filename)):
670 timestr = last_modified_hdr
673 filetime = timeconvert(timestr)
677 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description is being saved to *descfn*."""
    message = u'[info] Writing video description to: ' + descfn
    self.to_screen(message)
def report_writesubtitles(self, srtfn):
    """Announce that the subtitles are being saved to *srtfn*."""
    message = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(message)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file is being written to *infofn*."""
    message = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(message)
def report_destination(self, filename):
    """Announce the destination filename of the download."""
    message = u'[download] Destination: ' + filename
    self.to_screen(message)
698 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
699 """Report download progress."""
700 if self.params.get('noprogress', False):
702 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
703 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
704 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
705 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at byte *resume_len*."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Announce a retry after an HTTP 5xx error (attempt *count* of *retries*)."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
715 def report_file_already_downloaded(self, file_name):
716 """Report file has already been fully downloaded."""
718 self.to_screen(u'[download] %s has already been downloaded' % file_name)
719 except (UnicodeEncodeError), err:
720 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the download was not possible."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
726 def report_finish(self):
727 """Report download finished."""
728 if self.params.get('noprogress', False):
729 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads = self._num_downloads + 1
737 def prepare_filename(self, info_dict):
738 """Generate the output filename."""
740 template_dict = dict(info_dict)
741 template_dict['epoch'] = unicode(long(time.time()))
742 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
743 filename = self.params['outtmpl'] % template_dict
745 except (ValueError, KeyError), err:
746 self.trouble(u'ERROR: invalid system charset or erroneous output template')
749 def _match_entry(self, info_dict):
750 """ Returns None iff the file should be downloaded """
752 title = info_dict['title']
753 matchtitle = self.params.get('matchtitle', False)
754 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
755 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
756 rejecttitle = self.params.get('rejecttitle', False)
757 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
758 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
761 def process_info(self, info_dict):
762 """Process a single dictionary returned by an InfoExtractor."""
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen(u'[download] ' + reason)
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
774 filename = self.prepare_filename(info_dict)
777 if self.params.get('forcetitle', False):
778 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forceurl', False):
780 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
782 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
783 if self.params.get('forcedescription', False) and 'description' in info_dict:
784 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
785 if self.params.get('forcefilename', False) and filename is not None:
786 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
787 if self.params.get('forceformat', False):
788 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790 # Do nothing else if in simulate mode
791 if self.params.get('simulate', False):
798 dn = os.path.dirname(_encodeFilename(filename))
799 if dn != '' and not os.path.exists(dn): # dn is already encoded
801 except (OSError, IOError), err:
802 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
805 if self.params.get('writedescription', False):
807 descfn = filename + u'.description'
808 self.report_writedescription(descfn)
809 descfile = open(_encodeFilename(descfn), 'wb')
811 descfile.write(info_dict['description'].encode('utf-8'))
814 except (OSError, IOError):
815 self.trouble(u'ERROR: Cannot write description file ' + descfn)
818 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
819 # subtitles download errors are already managed as troubles in relevant IE
820 # that way it will silently go on when used with unsupporting IE
822 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
823 self.report_writesubtitles(srtfn)
824 srtfile = open(_encodeFilename(srtfn), 'wb')
826 srtfile.write(info_dict['subtitles'].encode('utf-8'))
829 except (OSError, IOError):
830 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
833 if self.params.get('writeinfojson', False):
834 infofn = filename + u'.info.json'
835 self.report_writeinfojson(infofn)
838 except (NameError,AttributeError):
839 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
842 infof = open(_encodeFilename(infofn), 'wb')
844 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
845 json.dump(json_info_dict, infof)
848 except (OSError, IOError):
849 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
852 if not self.params.get('skip_download', False):
853 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
857 success = self._do_download(filename, info_dict)
858 except (OSError, IOError), err:
859 raise UnavailableVideoError
860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863 except (ContentTooShortError, ), err:
864 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
869 self.post_process(filename, info_dict)
870 except (PostProcessingError), err:
871 self.trouble(u'ERROR: postprocessing: %s' % str(err))
874 def download(self, url_list):
875 """Download a given list of URLs."""
876 if len(url_list) > 1 and self.fixed_template():
877 raise SameFileError(self.params['outtmpl'])
880 suitable_found = False
882 # Go to next InfoExtractor if not suitable
883 if not ie.suitable(url):
886 # Suitable InfoExtractor found
887 suitable_found = True
889 # Extract information from URL and process it
892 # Suitable InfoExtractor had been found; go to next URL
895 if not suitable_found:
896 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898 return self._download_retcode
900 def post_process(self, filename, ie_info):
901 """Run the postprocessing chain on the given file."""
903 info['filepath'] = filename
909 def _download_with_rtmpdump(self, filename, url, player_url):
910 self.report_destination(filename)
911 tmpfilename = self.temp_name(filename)
913 # Check for rtmpdump first
915 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
916 except (OSError, IOError):
917 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
920 # Download using rtmpdump. rtmpdump returns exit code 2 when
921 # the connection was interrumpted and resuming appears to be
922 # possible. This is part of rtmpdump's normal usage, AFAIK.
923 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
924 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
925 if self.params.get('verbose', False):
928 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
931 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
932 retval = subprocess.call(args)
933 while retval == 2 or retval == 1:
934 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
935 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
936 time.sleep(5.0) # This seems to be needed
937 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
938 cursize = os.path.getsize(_encodeFilename(tmpfilename))
939 if prevsize == cursize and retval == 1:
941 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
942 if prevsize == cursize and retval == 2 and cursize > 1024:
943 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
947 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
948 self.try_rename(tmpfilename, filename)
951 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
954 def _do_download(self, filename, info_dict):
955 url = info_dict['url']
956 player_url = info_dict.get('player_url', None)
958 # Check file already present
959 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
960 self.report_file_already_downloaded(filename)
963 # Attempt to download using rtmpdump
964 if url.startswith('rtmp'):
965 return self._download_with_rtmpdump(filename, url, player_url)
967 tmpfilename = self.temp_name(filename)
970 # Do not include the Accept-Encoding header
971 headers = {'Youtubedl-no-compression': 'True'}
972 basic_request = urllib2.Request(url, None, headers)
973 request = urllib2.Request(url, None, headers)
975 # Establish possible resume length
976 if os.path.isfile(_encodeFilename(tmpfilename)):
977 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
983 if self.params.get('continuedl', False):
984 self.report_resuming_byte(resume_len)
985 request.add_header('Range','bytes=%d-' % resume_len)
991 retries = self.params.get('retries', 0)
992 while count <= retries:
993 # Establish connection
995 if count == 0 and 'urlhandle' in info_dict:
996 data = info_dict['urlhandle']
997 data = urllib2.urlopen(request)
999 except (urllib2.HTTPError, ), err:
1000 if (err.code < 500 or err.code >= 600) and err.code != 416:
1001 # Unexpected HTTP error
1003 elif err.code == 416:
1004 # Unable to resume (requested range not satisfiable)
1006 # Open the connection again without the range header
1007 data = urllib2.urlopen(basic_request)
1008 content_length = data.info()['Content-Length']
1009 except (urllib2.HTTPError, ), err:
1010 if err.code < 500 or err.code >= 600:
1013 # Examine the reported length
1014 if (content_length is not None and
1015 (resume_len - 100 < long(content_length) < resume_len + 100)):
1016 # The file had already been fully downloaded.
1017 # Explanation to the above condition: in issue #175 it was revealed that
1018 # YouTube sometimes adds or removes a few bytes from the end of the file,
1019 # changing the file size slightly and causing problems for some users. So
1020 # I decided to implement a suggested change and consider the file
1021 # completely downloaded if the file size differs less than 100 bytes from
1022 # the one in the hard drive.
1023 self.report_file_already_downloaded(filename)
1024 self.try_rename(tmpfilename, filename)
1027 # The length does not match, we start the download over
1028 self.report_unable_to_resume()
1033 if count <= retries:
1034 self.report_retry(count, retries)
1037 self.trouble(u'ERROR: giving up after %s retries' % retries)
1040 data_len = data.info().get('Content-length', None)
1041 if data_len is not None:
1042 data_len = long(data_len) + resume_len
1043 data_len_str = self.format_bytes(data_len)
1044 byte_counter = 0 + resume_len
1048 # Download and write
1049 before = time.time()
1050 data_block = data.read(block_size)
1052 if len(data_block) == 0:
1054 byte_counter += len(data_block)
1056 # Open file just in time
1059 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1060 assert stream is not None
1061 filename = self.undo_temp_name(tmpfilename)
1062 self.report_destination(filename)
1063 except (OSError, IOError), err:
1064 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1067 stream.write(data_block)
1068 except (IOError, OSError), err:
1069 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071 block_size = self.best_block_size(after - before, len(data_block))
1074 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1075 if data_len is None:
1076 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078 percent_str = self.calc_percent(byte_counter, data_len)
1079 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1080 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1083 self.slow_down(start, byte_counter - resume_len)
1086 self.trouble(u'\nERROR: Did not get any data blocks')
1089 self.report_finish()
1090 if data_len is not None and byte_counter != data_len:
1091 raise ContentTooShortError(byte_counter, long(data_len))
1092 self.try_rename(tmpfilename, filename)
1094 # Update file modification time
1095 if self.params.get('updatetime', True):
1096 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    stitle:      Simplified title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Template method: subclasses hook in via _real_initialize().
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: subclasses hook in via _real_extract().
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE: group(1) is the scheme/host/path prefix; group(2) is the video id.
    # The trailing (?(1).+)? is a conditional match (requires extra chars only
    # when the prefix group matched).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension. [most entries elided in this view]
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string used by _print_formats.
    # [dictionary entries elided in this view]
    _video_dimensions = {
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) markup.

        NOTE(review): the accumulator initialization (srt = '') and the
        final return of srt fall outside this view — confirm they are intact.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when dur= is absent
            start = float(start)
            end = start + float(dur)
            # Format as hh:mm:ss,mmm timestamps required by SRT
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional (handles doubly-escaped entities)
            # NOTE(review): SRT cue numbers conventionally start at 1; enumerate
            # starts at 0 — most players tolerate this, but verify.
            srt += str(n) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and resolution."""
        print 'Available formats:'
        # [loop header over formats elided in this view]
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set site language, then optionally log in and confirm age.

        Credentials come from --username/--password, or from ~/.netrc when
        --netrc was given. All failures here are warnings, not fatal errors,
        except age confirmation.
        """
        if self._downloader is None:
            # [early return elided in this view]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [enclosing 'try:' elided in this view]
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # [assignment of username/password from info elided]
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # [return elided in this view]

        # Force the YouTube UI into English so later regexes match.
        request = urllib2.Request(self._LANG_URL)
        # [enclosing 'try:' and report_lang() call elided in this view]
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:
            # [return and start of login_form dict elided in this view]
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        # [dict close elided] POST the login form.
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [enclosing 'try:' and report_login() call elided in this view]
            login_results = urllib2.urlopen(request).read()
            # A loginForm still present in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # [start of age_form dict elided in this view]
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # [enclosing 'try:' elided in this view]
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick formats, and
        hand one info dict per chosen format to the FileDownloader."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # [enclosing 'try:' elided in this view]
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [else branch (player_url = None) elided in this view]

        # Try several 'el' contexts until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # [enclosing 'try:' elided in this view]
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # [loop break elided in this view]
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # [else branch elided in this view]
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page, tried in several formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # [enclosing 'try:' elided in this view]
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Video description: meta tag first, then lxml XPath on the page body.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')
        # NOTE(review): 'lxml' is not imported in this view — presumably
        # guarded elsewhere; confirm before relying on these lines.
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # Closed captions (optional, --write-srt)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            # [enclosing 'try:' elided in this view]
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                # Language preference: explicit flag > English > first available.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # [srt_lang = 'en' and else: elided in this view]
                    srt_lang = srt_lang_list[0]
                if not srt_lang in srt_lang_list:
                    self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
                    request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                    # [enclosing 'try:' elided in this view]
                        srt_xml = urllib2.urlopen(request).read()
                    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                        video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                self._downloader.trouble(u'WARNING: video has no closed captions')

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            # RTMP streams carry no itag; format is None.
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
            # [else: elided in this view]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # ['if rf in url_map:' elided in this view]
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            video_extension = self._video_extensions.get(format_param, 'flv')

            # [enclosing 'try:' elided in this view]
                # Process video information
                self._downloader.process_info({
                    'id':		video_id.decode('utf-8'),
                    'url':		video_real_url.decode('utf-8'),
                    'uploader':	video_uploader.decode('utf-8'),
                    'upload_date':	upload_date,
                    'title':	video_title,
                    'stitle':	simple_title,
                    'ext':		video_extension.decode('utf-8'),
                    'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail':	video_thumbnail.decode('utf-8'),
                    'description':	video_description,
                    'player_url':	player_url,
                    'subtitles':	video_subtitles
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        # Keeps a YoutubeIE so 'yt-' prefixed Metacafe ids can be delegated
        # to the YouTube extractor (see _real_extract).
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST past it so
        filtered videos become reachable."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # [enclosing 'try:' elided in this view]
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # [start of disclaimer_form dict elided in this view]
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # [enclosing 'try:' elided in this view]
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the real media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate re-hosted YouTube videos to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            # [return elided in this view]

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [enclosing 'try:' elided in this view]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL parameter.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # ['if mobj is None:' elided in this view]
                video_url = mediaURL
            # [else: elided in this view]
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [else: elided in this view] New-style page: flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # ['if mobj is None:' elided in this view]
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # ['if mobj is None:' elided in this view]
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Unescape JSON-escaped slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [enclosing 'try:' elided in this view]
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        # [enclosing 'try:' elided in this view]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        # sdURL = standard-definition stream URL inside the player 'sequence' blob.
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [enclosing 'try:' elided in this view]
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description from a videoplay page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # [enclosing 'try:' elided in this view]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the mp4 download_url; fall back to the flv videoUrl below.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # ['if mobj is None:' elided in this view]
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo JS hex escapes: \x3d -> '=', \x26 -> '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only lives on the search results page, so run a
            # site-restricted search for this docid.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # [enclosing 'try:' elided in this view]
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # ['if mobj is None:' elided in this view]
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            video_thumbnail = ''

        # [enclosing 'try:' elided in this view]
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_url.decode('utf-8'),
                # ['uploader' entry elided in this view]
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1877 class PhotobucketIE(InfoExtractor):
1878 """Information extractor for photobucket.com."""
1880 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1881 IE_NAME = u'photobucket'
1883 def __init__(self, downloader=None):
1884 InfoExtractor.__init__(self, downloader)
1886 def report_download_webpage(self, video_id):
1887 """Report webpage download."""
1888 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1890 def report_extraction(self, video_id):
1891 """Report information extraction."""
1892 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1894 def _real_extract(self, url):
1895 # Extract id from URL
1896 mobj = re.match(self._VALID_URL, url)
1898 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1901 # At this point we have a new video
1902 self._downloader.increment_downloads()
1903 video_id = mobj.group(1)
1905 video_extension = 'flv'
1907 # Retrieve video webpage to extract further information
1908 request = urllib2.Request(url)
1910 self.report_download_webpage(video_id)
1911 webpage = urllib2.urlopen(request).read()
1912 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916 # Extract URL, uploader, and title from webpage
1917 self.report_extraction(video_id)
1918 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1920 self._downloader.trouble(u'ERROR: unable to extract media URL')
1922 mediaURL = urllib.unquote(mobj.group(1))
1924 video_url = mediaURL
1926 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1928 self._downloader.trouble(u'ERROR: unable to extract title')
1930 video_title = mobj.group(1).decode('utf-8')
1931 video_title = sanitize_title(video_title)
1932 simple_title = _simplify_title(vide_title)
1934 video_uploader = mobj.group(2).decode('utf-8')
1937 # Process video information
1938 self._downloader.process_info({
1939 'id': video_id.decode('utf-8'),
1940 'url': video_url.decode('utf-8'),
1941 'uploader': video_uploader,
1942 'upload_date': u'NA',
1943 'title': video_title,
1944 'stitle': simple_title,
1945 'ext': video_extension.decode('utf-8'),
1949 except UnavailableVideoError:
1950 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from a Yahoo! Video page.

        new_video: False on the internal recursive call that re-enters with a
        rewritten /watch/ URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            # [enclosing 'try:' elided in this view]
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # ['if mobj is None:' elided in this view]
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # ['if mobj is None:' elided in this view]
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # [enclosing 'try:' elided in this view]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the 'people|profile' path segment; the
        # anchor text (the actual uploader name) is group(2) — verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # [enclosing 'try:' elided in this view]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # ['if mobj is None:' elided in this view]
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # [enclosing 'try:' elided in this view]
            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'uploader':	video_uploader,
                'upload_date':	u'NA',
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                # NOTE(review): duplicate 'thumbnail' key — this entry silently
                # overrides the .decode('utf-8') one above; drop one of them.
                'thumbnail':	video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this excerpt carries baked-in original line numbers and they are
# non-contiguous — intervening statements (try: headers, `if mobj is None:`
# guards, returns) are elided from this view. Comments below describe only what
# is visible; do not assume elided lines are absent from the real file.
2105 class VimeoIE(InfoExtractor):
2106 """Information extractor for vimeo.com."""
2108 # _VALID_URL matches Vimeo URLs
2109 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2112 def __init__(self, downloader=None):
2113 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: both write status lines via the downloader.
2115 def report_download_webpage(self, video_id):
2116 """Report webpage download."""
2117 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2119 def report_extraction(self, video_id):
2120 """Report information extraction."""
2121 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main entry point: fetches the watch page, pulls the embedded config JSON,
# derives title/uploader/thumbnail/codec, and hands the result to process_info.
2123 def _real_extract(self, url, new_video=True):
2124 # Extract ID from URL
2125 mobj = re.match(self._VALID_URL, url)
2127 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2130 # At this point we have a new video
2131 self._downloader.increment_downloads()
2132 video_id = mobj.group(1)
2134 # Retrieve video webpage to extract further information
2135 request = urllib2.Request(url, None, std_headers)
2137 self.report_download_webpage(video_id)
2138 webpage = urllib2.urlopen(request).read()
2139 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2140 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2143 # Now we begin extracting as much information as we can from what we
2144 # retrieved. First we extract the information common to all extractors,
2145 # and latter we extract those that are Vimeo specific.
2146 self.report_extraction(video_id)
2148 # Extract the config JSON
# NOTE(review): brittle — relies on the page containing ' = {config:' and
# ',assets:' markers; an IndexError here is presumably caught by an elided
# try/except that reports 'unable to extract info section' below.
2149 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2151 config = json.loads(config)
2153 self._downloader.trouble(u'ERROR: unable to extract info section')
2157 video_title = config["video"]["title"]
2158 simple_title = _simplify_title(video_title)
2161 video_uploader = config["video"]["owner"]["name"]
2163 # Extract video thumbnail
2164 video_thumbnail = config["video"]["thumbnail"]
2166 # Extract video description
2170 video_description = u'No description available.'
2171 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2172 if mobj is not None:
2173 video_description = mobj.group(1)
# NOTE(review): lxml is a third-party dependency; this alternative path parses
# the page and joins all text under the element with id="description".
2175 html_parser = lxml.etree.HTMLParser()
2176 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2177 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2178 # TODO use another parser
2180 # Extract upload date
2181 video_upload_date = u'NA'
2182 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2183 if mobj is not None:
2184 video_upload_date = mobj.group(1)
2186 # Vimeo specific: extract request signature and timestamp
2187 sig = config['request']['signature']
2188 timestamp = config['request']['timestamp']
2190 # Vimeo specific: extract video codec and quality information
2191 # TODO bind to format param
# Codec preference order: first match in config["video"]["files"] wins.
2192 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2193 for codec in codecs:
2194 if codec[0] in config["video"]["files"]:
2195 video_codec = codec[0]
2196 video_extension = codec[1]
2197 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2198 else: quality = 'sd'
2201 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the signature/timestamp obtained above.
2204 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2205 %(video_id, sig, timestamp, quality, video_codec.upper())
2208 # Process video information
2209 self._downloader.process_info({
2212 'uploader': video_uploader,
2213 'upload_date': video_upload_date,
2214 'title': video_title,
2215 'stitle': simple_title,
2216 'ext': video_extension,
2217 'thumbnail': video_thumbnail,
2218 'description': video_description,
2221 except UnavailableVideoError:
2222 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): excerpt with elided lines (original line numbers are
# non-contiguous); guards and try: headers between visible lines are missing
# from this view, not necessarily from the real file.
2225 class GenericIE(InfoExtractor):
2226 """Generic last-resort information extractor."""
2229 IE_NAME = u'generic'
2231 def __init__(self, downloader=None):
2232 InfoExtractor.__init__(self, downloader)
2234 def report_download_webpage(self, video_id):
2235 """Report webpage download."""
# Warns explicitly, since reaching this extractor means no specific IE matched.
2236 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2237 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2239 def report_extraction(self, video_id):
2240 """Report information extraction."""
2241 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# Heuristic extraction: fetch the page, look for a JW-Player-style file= URL,
# and fall back to any file=/source= parameter pointing at http.
2243 def _real_extract(self, url):
2244 # At this point we have a new video
2245 self._downloader.increment_downloads()
2247 video_id = url.split('/')[-1]
2248 request = urllib2.Request(url)
2250 self.report_download_webpage(video_id)
2251 webpage = urllib2.urlopen(request).read()
2252 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2253 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2255 except ValueError, err:
2256 # since this is the last-resort InfoExtractor, if
2257 # this error is thrown, it'll be thrown here
2258 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2261 self.report_extraction(video_id)
2262 # Start with something easy: JW Player in SWFObject
2263 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2265 # Broaden the search a little bit
2266 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2271 # It's possible that one of the regexes
2272 # matched, but returned an empty group:
2273 if mobj.group(1) is None:
2274 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2277 video_url = urllib.unquote(mobj.group(1))
2278 video_id = os.path.basename(video_url)
2280 # here's a fun little line of code for you:
2281 video_extension = os.path.splitext(video_id)[1][1:]
2282 video_id = os.path.splitext(video_id)[0]
2284 # it's tempting to parse this further, but you would
2285 # have to take into account all the variations like
2286 # Video Title - Site Name
2287 # Site Name | Video Title
2288 # Video Title - Tagline | Site Name
2289 # and so on and so forth; it's just not practical
2290 mobj = re.search(r'<title>(.*)</title>', webpage)
2292 self._downloader.trouble(u'ERROR: unable to extract title')
2294 video_title = mobj.group(1).decode('utf-8')
2295 video_title = sanitize_title(video_title)
2296 simple_title = _simplify_title(video_title)
2298 # video uploader is domain name
2299 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says 'title' but this branch is about the uploader
# (domain name) — looks like a copy-paste slip; should read
# 'unable to extract uploader'. Confirm before changing the string.
2301 self._downloader.trouble(u'ERROR: unable to extract title')
2303 video_uploader = mobj.group(1).decode('utf-8')
2306 # Process video information
2307 self._downloader.process_info({
2308 'id': video_id.decode('utf-8'),
2309 'url': video_url.decode('utf-8'),
2310 'uploader': video_uploader,
2311 'upload_date': u'NA',
2312 'title': video_title,
2313 'stitle': simple_title,
2314 'ext': video_extension.decode('utf-8'),
2318 except UnavailableVideoError, err:
2319 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt with elided lines (non-contiguous original numbering);
# the try:/if-guard lines between visible statements are missing from this view.
2322 class YoutubeSearchIE(InfoExtractor):
2323 """Information Extractor for YouTube search queries."""
# Accepts queries of the form ytsearch:Q, ytsearchN:Q, or ytsearchall:Q.
2324 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 API, JSON-C output, 50 results per page (the API page size).
2325 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2327 _max_youtube_results = 1000
2328 IE_NAME = u'youtube:search'
# Delegates actual video extraction to the wrapped YoutubeIE instance.
2330 def __init__(self, youtube_ie, downloader=None):
2331 InfoExtractor.__init__(self, downloader)
2332 self._youtube_ie = youtube_ie
2334 def report_download_page(self, query, pagenum):
2335 """Report attempt to download playlist page with given number."""
2336 query = query.decode(preferredencoding())
2337 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2339 def _real_initialize(self):
2340 self._youtube_ie.initialize()
# Parses the prefix (count / 'all' / empty) and dispatches to
# _download_n_results with the requested result count.
2342 def _real_extract(self, query):
2343 mobj = re.match(self._VALID_URL, query)
2345 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2348 prefix, query = query.split(':')
2350 query = query.encode('utf-8')
2352 self._download_n_results(query, 1)
2354 elif prefix == 'all':
2355 self._download_n_results(query, self._max_youtube_results)
2361 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2363 elif n > self._max_youtube_results:
2364 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2365 n = self._max_youtube_results
2366 self._download_n_results(query, n)
2368 except ValueError: # parsing prefix as integer fails
2369 self._download_n_results(query, 1)
2372 def _download_n_results(self, query, n):
2373 """Downloads a specified number of results for a query"""
# Pages through the API 50 ids at a time until `limit` (min of n and the
# API-reported totalItems) is reached, then extracts each collected id.
2379 while (50 * pagenum) < limit:
2380 self.report_download_page(query, pagenum+1)
2381 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2382 request = urllib2.Request(result_url)
2384 data = urllib2.urlopen(request).read()
2385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2388 api_response = json.loads(data)['data']
2390 new_ids = list(video['id'] for video in api_response['items'])
2391 video_ids += new_ids
2393 limit = min(n, api_response['totalItems'])
2396 if len(video_ids) > n:
2397 video_ids = video_ids[:n]
2398 for id in video_ids:
2399 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): excerpt with elided lines (non-contiguous original numbering).
2403 class GoogleSearchIE(InfoExtractor):
2404 """Information Extractor for Google Video search queries."""
# Accepts gvsearch:Q, gvsearchN:Q, or gvsearchall:Q.
2405 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2406 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2407 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" link marks that more result pages exist.
2408 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2410 _max_google_results = 1000
2411 IE_NAME = u'video.google:search'
# Delegates actual extraction to the wrapped GoogleIE instance.
2413 def __init__(self, google_ie, downloader=None):
2414 InfoExtractor.__init__(self, downloader)
2415 self._google_ie = google_ie
2417 def report_download_page(self, query, pagenum):
2418 """Report attempt to download playlist page with given number."""
2419 query = query.decode(preferredencoding())
2420 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2422 def _real_initialize(self):
2423 self._google_ie.initialize()
# Same prefix-dispatch structure as YoutubeSearchIE._real_extract.
2425 def _real_extract(self, query):
2426 mobj = re.match(self._VALID_URL, query)
2428 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2431 prefix, query = query.split(':')
2433 query = query.encode('utf-8')
2435 self._download_n_results(query, 1)
2437 elif prefix == 'all':
2438 self._download_n_results(query, self._max_google_results)
2444 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2446 elif n > self._max_google_results:
2447 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2448 n = self._max_google_results
2449 self._download_n_results(query, n)
2451 except ValueError: # parsing prefix as integer fails
2452 self._download_n_results(query, 1)
2455 def _download_n_results(self, query, n):
2456 """Downloads a specified number of results for a query"""
# Scrapes result pages (10 per page via start=pagenum*10) for docids; stops
# when n ids are collected or no "next page" indicator remains.
2462 self.report_download_page(query, pagenum)
2463 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2464 request = urllib2.Request(result_url)
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471 # Extract video identifiers
2472 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2473 video_id = mobj.group(1)
2474 if video_id not in video_ids:
2475 video_ids.append(video_id)
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2487 pagenum = pagenum + 1
# NOTE(review): excerpt with elided lines (non-contiguous original numbering).
2490 class YahooSearchIE(InfoExtractor):
2491 """Information Extractor for Yahoo! Video search queries."""
# Accepts yvsearch:Q, yvsearchN:Q, or yvsearchall:Q.
2492 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2493 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2494 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2495 _MORE_PAGES_INDICATOR = r'\s*Next'
2497 _max_yahoo_results = 1000
2498 IE_NAME = u'video.yahoo:search'
# Delegates actual extraction to the wrapped YahooIE instance.
2500 def __init__(self, yahoo_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._yahoo_ie = yahoo_ie
2504 def report_download_page(self, query, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 query = query.decode(preferredencoding())
2507 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2509 def _real_initialize(self):
2510 self._yahoo_ie.initialize()
# Same prefix-dispatch structure as the other search IEs in this file.
2512 def _real_extract(self, query):
2513 mobj = re.match(self._VALID_URL, query)
2515 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2518 prefix, query = query.split(':')
2520 query = query.encode('utf-8')
2522 self._download_n_results(query, 1)
2524 elif prefix == 'all':
2525 self._download_n_results(query, self._max_yahoo_results)
2531 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2533 elif n > self._max_yahoo_results:
2534 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2535 n = self._max_yahoo_results
2536 self._download_n_results(query, n)
2538 except ValueError: # parsing prefix as integer fails
2539 self._download_n_results(query, 1)
2542 def _download_n_results(self, query, n):
2543 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup uses an explicit `already_seen` set alongside
# the ordered id list.
2546 already_seen = set()
2550 self.report_download_page(query, pagenum)
2551 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2552 request = urllib2.Request(result_url)
2554 page = urllib2.urlopen(request).read()
2555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559 # Extract video identifiers
2560 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561 video_id = mobj.group(1)
2562 if video_id not in already_seen:
2563 video_ids.append(video_id)
2564 already_seen.add(video_id)
2565 if len(video_ids) == n:
2566 # Specified n videos reached
2567 for id in video_ids:
2568 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2571 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2572 for id in video_ids:
2573 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2576 pagenum = pagenum + 1
# NOTE(review): excerpt with elided lines (non-contiguous original numbering).
2579 class YoutubePlaylistIE(InfoExtractor):
2580 """Information Extractor for YouTube playlists."""
# Group 1: playlist-type prefix (p/a/list); group 2: playlist id; group 3:
# optional trailing video id (if present, extraction short-circuits to it).
2582 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2583 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2584 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2585 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2587 IE_NAME = u'youtube:playlist'
# Delegates per-video extraction to the wrapped YoutubeIE instance.
2589 def __init__(self, youtube_ie, downloader=None):
2590 InfoExtractor.__init__(self, downloader)
2591 self._youtube_ie = youtube_ie
2593 def report_download_page(self, playlist_id, pagenum):
2594 """Report attempt to download playlist page with given number."""
2595 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2597 def _real_initialize(self):
2598 self._youtube_ie.initialize()
# Pages through the playlist HTML collecting watch?v= ids, applies the
# playliststart/playlistend window, then extracts each id in order.
2600 def _real_extract(self, url):
2601 # Extract playlist id
2602 mobj = re.match(self._VALID_URL, url)
2604 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video shortcut: URL pointed directly at one entry of the playlist.
2608 if mobj.group(3) is not None:
2609 self._youtube_ie.extract(mobj.group(3))
2612 # Download playlist pages
2613 # prefix is 'p' as default for playlists but there are other types that need extra care
2614 playlist_prefix = mobj.group(1)
2615 if playlist_prefix == 'a':
2616 playlist_access = 'artist'
2618 playlist_prefix = 'p'
2619 playlist_access = 'view_play_list'
2620 playlist_id = mobj.group(2)
2625 self.report_download_page(playlist_id, pagenum)
2626 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2627 request = urllib2.Request(url)
2629 page = urllib2.urlopen(request).read()
2630 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2631 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2634 # Extract video identifiers
2636 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2637 if mobj.group(1) not in ids_in_page:
2638 ids_in_page.append(mobj.group(1))
2639 video_ids.extend(ids_in_page)
2641 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2643 pagenum = pagenum + 1
# playliststart is 1-based in params, converted to a 0-based slice index here.
2645 playliststart = self._downloader.params.get('playliststart', 1) - 1
2646 playlistend = self._downloader.params.get('playlistend', -1)
2647 if playlistend == -1:
2648 video_ids = video_ids[playliststart:]
2650 video_ids = video_ids[playliststart:playlistend]
2652 for id in video_ids:
2653 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): excerpt with elided lines (non-contiguous original numbering).
2657 class YoutubeUserIE(InfoExtractor):
2658 """Information Extractor for YouTube users."""
2660 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2661 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData uploads feed is paged; 50 is the per-request page size used below.
2662 _GDATA_PAGE_SIZE = 50
2663 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2664 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2666 IE_NAME = u'youtube:user'
# Delegates per-video extraction to the wrapped YoutubeIE instance.
2668 def __init__(self, youtube_ie, downloader=None):
2669 InfoExtractor.__init__(self, downloader)
2670 self._youtube_ie = youtube_ie
2672 def report_download_page(self, username, start_index):
2673 """Report attempt to download user page."""
2674 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2675 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2677 def _real_initialize(self):
2678 self._youtube_ie.initialize()
# Collects all upload ids for a user via the GData API, applies the
# playliststart/playlistend window, then extracts each id.
2680 def _real_extract(self, url):
2682 mobj = re.match(self._VALID_URL, url)
2684 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2687 username = mobj.group(1)
2689 # Download video ids using YouTube Data API. Result size per
2690 # query is limited (currently to 50 videos) so we need to query
2691 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2698 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2699 self.report_download_page(username, start_index)
2701 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2704 page = urllib2.urlopen(request).read()
2705 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2706 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2709 # Extract video identifiers
2712 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2713 if mobj.group(1) not in ids_in_page:
2714 ids_in_page.append(mobj.group(1))
2716 video_ids.extend(ids_in_page)
2718 # A little optimization - if current page is not
2719 # "full", ie. does not contain PAGE_SIZE video ids then
2720 # we can assume that this page is the last one - there
2721 # are no more ids on further pages - no need to query
2724 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2729 all_ids_count = len(video_ids)
# Same 1-based-to-0-based playliststart conversion as YoutubePlaylistIE.
2730 playliststart = self._downloader.params.get('playliststart', 1) - 1
2731 playlistend = self._downloader.params.get('playlistend', -1)
2733 if playlistend == -1:
2734 video_ids = video_ids[playliststart:]
2736 video_ids = video_ids[playliststart:playlistend]
2738 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2739 (username, all_ids_count, len(video_ids)))
2741 for video_id in video_ids:
2742 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): excerpt with elided lines (non-contiguous original numbering).
2745 class DepositFilesIE(InfoExtractor):
2746 """Information extractor for depositfiles.com"""
2748 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2749 IE_NAME = u'DepositFiles'
2751 def __init__(self, downloader=None):
2752 InfoExtractor.__init__(self, downloader)
2754 def report_download_webpage(self, file_id):
2755 """Report webpage download."""
2756 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2758 def report_extraction(self, file_id):
2759 """Report information extraction."""
2760 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# Simulates pressing the "Free download" button (POST gateway_result=1),
# then scrapes the real fileshare URL and title from the resulting page.
2762 def _real_extract(self, url):
2763 # At this point we have a new file
2764 self._downloader.increment_downloads()
2766 file_id = url.split('/')[-1]
2767 # Rebuild url in english locale
2768 url = 'http://depositfiles.com/en/files/' + file_id
2770 # Retrieve file webpage with 'Free download' button pressed
2771 free_download_indication = { 'gateway_result' : '1' }
# Passing a data argument makes this a POST request.
2772 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2774 self.report_download_webpage(file_id)
2775 webpage = urllib2.urlopen(request).read()
2776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2777 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2780 # Search for the real file URL
2781 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2782 if (mobj is None) or (mobj.group(1) is None):
2783 # Try to figure out reason of the error.
2784 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2785 if (mobj is not None) and (mobj.group(1) is not None):
2786 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2787 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2789 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2792 file_url = mobj.group(1)
2793 file_extension = os.path.splitext(file_url)[1][1:]
2795 # Search for file title
2796 mobj = re.search(r'<b title="(.*?)">', webpage)
2798 self._downloader.trouble(u'ERROR: unable to extract title')
2800 file_title = mobj.group(1).decode('utf-8')
2803 # Process file information
2804 self._downloader.process_info({
2805 'id': file_id.decode('utf-8'),
2806 'url': file_url.decode('utf-8'),
2808 'upload_date': u'NA',
2809 'title': file_title,
2810 'stitle': file_title,
2811 'ext': file_extension.decode('utf-8'),
2815 except UnavailableVideoError, err:
2816 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): excerpt with elided lines (non-contiguous original numbering);
# try:/if/return lines between visible statements are missing from this view.
2819 class FacebookIE(InfoExtractor):
2820 """Information Extractor for Facebook"""
2822 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2823 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# machine name used to look up credentials in the user's ~/.netrc
2824 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; used for format selection in _real_extract.
2825 _available_formats = ['video', 'highqual', 'lowqual']
2826 _video_extensions = {
2831 IE_NAME = u'facebook'
2833 def __init__(self, downloader=None):
2834 InfoExtractor.__init__(self, downloader)
2836 def _reporter(self, message):
2837 """Add header and report message."""
2838 self._downloader.to_screen(u'[facebook] %s' % message)
2840 def report_login(self):
2841 """Report attempt to log in."""
2842 self._reporter(u'Logging in')
2844 def report_video_webpage_download(self, video_id):
2845 """Report attempt to download video webpage."""
2846 self._reporter(u'%s: Downloading video webpage' % video_id)
2848 def report_information_extraction(self, video_id):
2849 """Report attempt to extract video information."""
2850 self._reporter(u'%s: Extracting video information' % video_id)
2852 def _parse_page(self, video_webpage):
2853 """Extract video information from page"""
# Field-name -> regex table; each pattern captures the value in group 1.
2855 data = {'title': r'\("video_title", "(.*?)"\)',
2856 'description': r'<div class="datawrap">(.*?)</div>',
2857 'owner': r'\("video_owner_name", "(.*?)"\)',
2858 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2861 for piece in data.keys():
2862 mobj = re.search(data[piece], video_webpage)
2863 if mobj is not None:
# Values are unicode-escaped inside the page's JS; decode then unquote.
2864 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2868 for fmt in self._available_formats:
2869 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2870 if mobj is not None:
2871 # URL is in a Javascript segment inside an escaped Unicode format within
2872 # the generally utf-8 page
2873 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2874 video_info['video_urls'] = video_urls
# Logs in using --username/--password or .netrc credentials, warning (not
# failing) when login is impossible or rejected.
2878 def _real_initialize(self):
2879 if self._downloader is None:
2884 downloader_params = self._downloader.params
2886 # Attempt to use provided username and password or .netrc data
2887 if downloader_params.get('username', None) is not None:
2888 useremail = downloader_params['username']
2889 password = downloader_params['password']
2890 elif downloader_params.get('usenetrc', False):
2892 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2893 if info is not None:
2897 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2898 except (IOError, netrc.NetrcParseError), err:
2899 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2902 if useremail is None:
2911 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2914 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2915 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2916 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2919 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Fetches the video page, parses it via _parse_page, selects formats per the
# --format/--format-limit options, and emits one process_info per format.
2922 def _real_extract(self, url):
2923 mobj = re.match(self._VALID_URL, url)
2925 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2927 video_id = mobj.group('ID')
2930 self.report_video_webpage_download(video_id)
2931 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2933 page = urllib2.urlopen(request)
2934 video_webpage = page.read()
2935 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2936 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2939 # Start extracting information
2940 self.report_information_extraction(video_id)
2942 # Extract information
2943 video_info = self._parse_page(video_webpage)
2946 if 'owner' not in video_info:
2947 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2949 video_uploader = video_info['owner']
2952 if 'title' not in video_info:
2953 self._downloader.trouble(u'ERROR: unable to extract video title')
2955 video_title = video_info['title']
2956 video_title = video_title.decode('utf-8')
2957 video_title = sanitize_title(video_title)
2959 simple_title = _simplify_title(video_title)
# Missing thumbnail is a warning, not an error — extraction proceeds.
2962 if 'thumbnail' not in video_info:
2963 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2964 video_thumbnail = ''
2966 video_thumbnail = video_info['thumbnail']
2970 if 'upload_date' in video_info:
2971 upload_time = video_info['upload_date']
2972 timetuple = email.utils.parsedate_tz(upload_time)
2973 if timetuple is not None:
2975 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2980 video_description = video_info.get('description', 'No description available.')
2982 url_map = video_info['video_urls']
2983 if len(url_map.keys()) > 0:
2984 # Decide which formats to download
2985 req_format = self._downloader.params.get('format', None)
2986 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: only formats at or below the limit survive.
2988 if format_limit is not None and format_limit in self._available_formats:
2989 format_list = self._available_formats[self._available_formats.index(format_limit):]
2991 format_list = self._available_formats
2992 existing_formats = [x for x in format_list if x in url_map]
2993 if len(existing_formats) == 0:
2994 self._downloader.trouble(u'ERROR: no known formats available for video')
2996 if req_format is None:
2997 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2998 elif req_format == 'worst':
2999 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3000 elif req_format == '-1':
3001 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3004 if req_format not in url_map:
3005 self._downloader.trouble(u'ERROR: requested format not available')
3007 video_url_list = [(req_format, url_map[req_format])] # Specific format
3009 for format_param, video_real_url in video_url_list:
3011 # At this point we have a new video
3012 self._downloader.increment_downloads()
3015 video_extension = self._video_extensions.get(format_param, 'mp4')
3018 # Process video information
3019 self._downloader.process_info({
3020 'id': video_id.decode('utf-8'),
3021 'url': video_real_url.decode('utf-8'),
3022 'uploader': video_uploader.decode('utf-8'),
3023 'upload_date': upload_date,
3024 'title': video_title,
3025 'stitle': simple_title,
3026 'ext': video_extension.decode('utf-8'),
3027 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3028 'thumbnail': video_thumbnail.decode('utf-8'),
3029 'description': video_description.decode('utf-8'),
3032 except UnavailableVideoError, err:
3033 self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # NOTE(review): this copy of the file elides several lines (guard
    # clauses, `try:` headers, `return`s, parts of dict literals). The
    # visible statements are kept verbatim below; structure is inferred.
    # Verify against the complete upstream source before editing logic.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern used to pull the file extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # Validate URL shape; on failure report through the downloader.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Ask blip.tv for a JSON description of the media (skin=json API).
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # URL points straight at the media file: derive title and
            # extension from the last path component instead of the JSON.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        'stitle': _simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
        # blip.tv wraps the metadata under a 'Post' key.
        json_data = json.loads(json_code)
        if 'Post' in json_data:
            data = json_data['Post']
        # 'datestamp' looks like '%m-%d-%y %H:%M%p'; normalise to YYYYMMDD.
        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
        video_url = data['media']['url']
        # The container extension comes from the media URL itself.
        umobj = re.match(self._URL_EXT, video_url)
        raise ValueError('Can not determine filename extension')
        ext = umobj.group(1)
        'id': data['item_id'],
        'uploader': data['display_name'],
        'upload_date': upload_date,
        'title': data['title'],
        'stitle': _simplify_title(data['title']),
        'format': data['media']['mimeType'],
        'thumbnail': data['thumbnailUrl'],
        'description': data['description'],
        'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3129 class MyVideoIE(InfoExtractor):
3130 """Information Extractor for myvideo.de."""
3132 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3133 IE_NAME = u'myvideo'
3135 def __init__(self, downloader=None):
3136 InfoExtractor.__init__(self, downloader)
3138 def report_download_webpage(self, video_id):
3139 """Report webpage download."""
3140 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3142 def report_extraction(self, video_id):
3143 """Report information extraction."""
3144 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3146 def _real_extract(self,url):
3147 mobj = re.match(self._VALID_URL, url)
3149 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3152 video_id = mobj.group(1)
3155 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3157 self.report_download_webpage(video_id)
3158 webpage = urllib2.urlopen(request).read()
3159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3160 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3163 self.report_extraction(video_id)
3164 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3167 self._downloader.trouble(u'ERROR: unable to extract media URL')
3169 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3171 mobj = re.search('<title>([^<]+)</title>', webpage)
3173 self._downloader.trouble(u'ERROR: unable to extract title')
3176 video_title = mobj.group(1)
3177 video_title = sanitize_title(video_title)
3179 simple_title = _simplify_title(video_title)
3182 self._downloader.process_info({
3186 'upload_date': u'NA',
3187 'title': video_title,
3188 'stitle': simple_title,
3193 except UnavailableVideoError:
3194 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, the info-dict literal). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    # Accepts ':tds'-style shortcuts as well as full show-page URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS episode index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Expand ':tds' / ':colbert' shortcuts to the full-episodes page,
        # then re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None
        # No episode component means "download the newest episode".
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')
        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # The server may redirect to the concrete episode page; re-match
        # the final URL to recover the episode title.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')
        # Pull the mtvnservices Flash URL embedded in the page; its second
        # group is the mediaGen URI used for index/config lookups.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the canonical player URL.
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
        # One <item> per video segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text
            # Per-segment config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, src) pairs from the rendition list.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
            # For now, just pick the highest bitrate
            format,video_url = turls[-1]
            self._downloader.increment_downloads()
            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'stitle': _simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, the info-dict literal). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # Parser instance used to unescape HTML entities in meta tags.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Description, thumbnail and player all come from OpenGraph/meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        # The player URL carries a percent-encoded config URL parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')
        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # Entry 1 of the playlist holds the actual media URL.
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        'uploader': showName,
        'upload_date': None,
        'stitle': _simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s). Visible statements kept verbatim; structure inferred —
    # verify against the complete upstream source.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal numeric ID distinct from the URL ID;
        # the moogaloop XML endpoint is keyed on it.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        # Populate the info dict from the metadata XML; a failure anywhere
        # in this lookup chain means the XML is malformed.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension (and format label) taken from the media URL suffix.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, the info-dict literal). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Fetch the canonical watch page for the numeric ID.
        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The flv URL is a percent-encoded query parameter in the page.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> tag minus the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, parts of the info dict). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Description scraped from the track page, if present.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Upload date scraped from the "pretty-date" element and
        # normalised to YYYYMMDD; failures are tolerated (see except below).
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, parts of the info dict). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The page stores a base64-encoded path in jsclassref; decoding it
        # and prefixing the rtmpe endpoint yields the stream URL.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # id and extension are both derived from the stream file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, loop bodies). Visible statements kept verbatim; structure
    # inferred — verify against the complete upstream source.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # jsonData[fmt] is either a {bitrate: [urls]} mapping or, when no
        # bitrate information exists, directly a list of urls (TypeError path).
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probe each candidate with a request; unreachable ones are skipped.
        for url in url_list:
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        """Pretty-print the available format/bitrate/extension table."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or unspecified): take the first format whose URLs respond.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')
        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `else:` branch headers, parts of dict literals). Visible statements
    # kept verbatim; structure inferred — verify against upstream.

    # Three shapes are accepted: a specific VideoPage (course+video), a
    # CoursePage (course only), and the site root / HomePage.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': _simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            # Each video has a sibling XML file describing it.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape
            course = mobj.group('course')
            'id': _simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            # Course title from <h1>; fall back to the id if absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Build a reference list of VideoPage links; each is fed back
            # through self.extract() below.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])

            # Root page branch: enumerate all CoursePage links site-wide.
            unescapeHTML = HTMLParser.HTMLParser().unescape
            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    # NOTE(review): this copy elides several lines (guards, `try:` headers,
    # `return`s, parts of the info dict). Visible statements kept verbatim;
    # structure inferred — verify against the complete upstream source.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Scheme-less URLs are accepted by the pattern; normalise them.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Song name and performer come from mtv_* meta tags (Latin-1 pages).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint lists the available renditions for this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # e.g. type "video/mp4" -> ext "mp4"; format label encodes
        # extension, dimensions and bitrate.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        self._downloader.increment_downloads()
        'uploader': performer,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for every post-processing step.

    A PostProcessor is registered with a downloader through the
    downloader's add_post_processor() method. After each successful
    download, the downloader walks its chain of PostProcessors and
    invokes run() on each one — first with the download's info
    dictionary, then with whatever the previous processor returned.

    The chain stops as soon as a processor returns None, or once the
    last processor has run.

    Like InfoExtractor objects, PostProcessors take part in a "mutual
    registration" handshake with their downloader.
    """

    def __init__(self, downloader=None):
        # The downloader may also be attached later via set_downloader().
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        ``information`` is an InfoExtractor-style dictionary with one
        extra key, 'filepath', naming the downloaded file on disk.

        Returning None halts the post-processing chain; returning an
        information dictionary (possibly with fields changed) hands it
        to the next processor. A PostProcessingError may be raised to
        signal failure to the downloader.
        """
        # Default behaviour: pass the information through unchanged.
        return information
class AudioConversionError(Exception):
    """Raised when an ffmpeg/ffprobe audio-conversion step fails.

    Fix: derive from Exception rather than BaseException. Subclassing
    BaseException puts this error in the same tier as SystemExit and
    KeyboardInterrupt, so generic ``except Exception`` handlers (and
    most error-reporting tooling) would silently let it propagate.
    Callers that catch AudioConversionError by name, or check it with
    isinstance() as FFmpegExtractAudioPP.run does, are unaffected.
    """

    def __init__(self, message):
        # Populate Exception.args as well, so str(err) shows the message.
        Exception.__init__(self, message)
        # Keep the explicit .message attribute: callers in this file read
        # e.message (Python 2 era idiom, gone from Exception in Python 3).
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that turns a downloaded video into an audio-only file
    by invoking ffprobe (codec detection) and ffmpeg (conversion/remux).

    NOTE(review): several original lines are elided in this view
    (try/except framing, returns, decorators, else branches). Comments
    below flag each gap instead of guessing at the missing code —
    verify against the full file.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        # preferredcodec: target audio codec name, or None -> 'best'
        #   (keep the codec the source already has when possible).
        # preferredquality: bitrate spec handed to ffmpeg via -ab, or None.
        # keepvideo: when False, the source video is deleted after conversion.
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    # [elided line above this def — presumably a @staticmethod decorator]
    def get_audio_codec(path):
        # Run ffprobe over the file and scan its stream dump for the
        # audio stream's codec name.
        # [elided: try: framing around the subprocess call]
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                # [elided: failure return — presumably None]
        except (IOError, OSError):
            # ffprobe binary missing or not executable.
            # [elided: failure return — presumably None]
        # [elided: audio_codec = None initialisation, presumably]
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                # Remember the latest codec_name= value; it belongs to the
                # stream whose codec_type= line follows.
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # [elided: return audio_codec, presumably]
        # [elided: fall-through return when no audio stream was found]

    # [elided line above this def — presumably a @staticmethod decorator]
    def run_ffmpeg(path, out_path, codec, more_opts):
        # [elided: branch that presumably sets acodec_opts = [] when codec
        #  is None (the 'wav' case uses codec None in run() below)]
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        # [elided: try: framing]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:
                # errno 2 == ENOENT: the ffmpeg binary itself is missing.
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            # [elided: handling of other IOError/OSError cases]
        if p.returncode != 0:
            # Surface ffmpeg's last stderr line as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert information['filepath'] to the preferred audio format."""
        path = information['filepath']
        # [elided line(s) between path assignment and the probe]
        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            # [elided: early abort, presumably return None]

        # [elided: more_opts initialisation, presumably]
        # Passthrough cases: the source already matches (or is container-
        # compatible with) the requested codec.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                # [elided: acodec assignment, presumably 'copy']
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                # [elided: acodec assignment, presumably 'copy']
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    # [elided: extension override, presumably 'ogg']
            # [elided: else: branch — fall back to MP3 encoding]
                acodec = 'libmp3lame'
                # [elided: extension assignment, presumably 'mp3']
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # [elided: else: — a specific codec was requested and differs]
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            # [elided: more_opts initialisation, presumably]
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                # [elided: extension override, presumably 'ogg']
            if self._preferredcodec == 'wav':
                # [elided: extension assignment, presumably 'wav']
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        # [elided: try: framing around the conversion]
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        # [elided: except: handler opening]
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            # [elided: else:]
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            # [elided: abort, presumably return None]

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            # [elided: try:]
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            # [elided: except clause — best-effort, warning only]
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            # [elided: try:]
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        # Point downstream processors at the new audio file.
        information['filepath'] = new_path
        # [elided: return information, presumably]
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # Refuse to run at all if we cannot rewrite our own script file.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

    # [elided: try: framing around the download below]
        urlh = urllib.urlopen(UPDATE_URL)
        newcontent = urlh.read()

        # Compare the __version__ string embedded in the downloaded script
        # with our own; bail out early when already current.
        vmatch = re.search("__version__ = '([^']+)'", newcontent)
        if vmatch is not None and vmatch.group(1) == __version__:
            downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
            # [elided: early return]
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

    # Overwrite this script in place with the freshly downloaded content.
    # [elided: try: framing around the overwrite below]
        outf = open(filename, 'wb')
        outf.write(newcontent)
        # [elided: outf.close(), presumably]
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
    # Read extra command-line arguments from a config file; returns a list
    # of argv-style tokens. NOTE(review): several lines are elided in this
    # view (try/except framing, accumulator init, the per-line loop, and
    # the final return) — verify against the full file.
    # [elided: try:]
    optionf = open(filename_bytes)
    # [elided: except IOError: around the open above]
    return [] # silently skip if file is not present
    # [elided: res = [] and a loop over optionf's lines, presumably]
    # shlex with comments=True lets the config file contain '#' comments.
    res += shlex.split(l, comments=True)
    # [elided: optionf.close() and return res, presumably]
def _format_option_string(option):
    ''' ('-o', '--option') -> -o, --format METAVAR'''
    # [elided: opts list initialisation, presumably opts = []]
    # Short option first, then long option, comma-separated when both exist.
    if option._short_opts: opts.append(option._short_opts[0])
    if option._long_opts: opts.append(option._long_opts[0])
    if len(opts) > 1: opts.insert(1, ', ')
    # Options that take a value get their metavar appended.
    if option.takes_value(): opts.append(' %s' % option.metavar)
    return "".join(opts)
def _find_term_columns():
    # Best-effort terminal width: honour $COLUMNS first, otherwise ask
    # `stty size`. NOTE(review): the early return for $COLUMNS, the try:
    # framing, and the fallback return are elided in this view.
    columns = os.environ.get('COLUMNS', None)
    # [elided: early return when $COLUMNS is set; try: framing below]
    sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out,err = sp.communicate()
    # `stty size` prints "rows cols"; take the second field.
    return int(out.split()[1])
    # [elided: exception handler / fallback return, presumably None]
    # Body of parseOpts(). NOTE(review): the 'def parseOpts():' line and a
    # default max_width initialisation fell outside this view — verify
    # against the full file.
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    # OptionParser keyword arguments.
    # [elided: 'kw = {' opening and, presumably, a 'formatter': fmt entry]
        'version' : __version__,
        'usage' : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    # [elided: closing '}' of the kw dict]
    parser = optparse.OptionParser(**kw)

    # One OptionGroup per help section.
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
    video_format.add_option('--write-srt',
            action='store_true', dest='writesubtitles',
            help='write video closed captions to a .srt file (currently youtube only)', default=False)
    video_format.add_option('--srt-lang',
            action='store', dest='subtitleslang', metavar='LANG',
            help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)
    # NOTE(review): -v resolves against --version above via the parser's
    # 'resolve' conflict handler; the later definition wins.
    verbosity.add_option('-v', '--verbose',
            action='store_true', dest='verbose', help='print various debugging information', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Effective argv = system config + user config + real command line, so
    # later (command-line) options override config-file ones.
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    # [elided: if/else framing around the two userConf assignments —
    #  presumably XDG path when set, ~/.config fallback otherwise]
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    # [elided: docstring closing quotes]
    # Shared base extractors: the playlist/user/search extractors delegate
    # the per-video work to these instances.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # [elided: 'return [' and several list entries before/between the
    #  visible ones — this view shows only part of the extractor list]
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        MetacafeIE(youtube_ie),
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
        StanfordOpenClassroomIE(),
    # [elided: remaining extractor entries and the closing bracket;
    #  a catch-all GenericIE is conventionally last — verify]
    # Body of the program's main routine; its 'def' line fell outside this
    # view. NOTE(review): elided lines are flagged inline — verify against
    # the full file.
    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    # [elided: else: / try: framing for the file-backed jar below]
        jar = cookielib.MozillaCookieJar(opts.cookiefile)
        # Only attempt to load the jar if the file exists and is readable.
        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
            # [elided: jar.load(), presumably]
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

    # --dump-user-agent: print the UA string and stop.
    if opts.dump_user_agent:
        print std_headers['User-Agent']
        # [elided: sys.exit, presumably]

    # Batch file verification
    # [elided: batchurls = [] initialisation, presumably]
    if opts.batchfile is not None:
        # [elided: try:]
            if opts.batchfile == '-':
                # [elided: read from stdin, presumably batchfd = sys.stdin]
            # [elided: else:]
                batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            # Skip blank lines and lines starting with '#', '/' or ';'.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
        # [elided: except IOError:]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args
    all_urls = map(lambda url: url.strip(), all_urls)

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    proxy_handler = urllib2.ProxyHandler()
    opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # [elided: if opts.verbose: guard, presumably]
        print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

    extractors = gen_extractors()

    # --list-extractors: show each extractor and which given URLs it claims.
    if opts.list_extractors:
        for ie in extractors:
            # [elided: print of the extractor name, presumably]
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            # Each URL is claimed by at most one extractor (first match wins).
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
                # [elided: print of the matched URL; sys.exit after the loop]

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        # [elided: try:]
            opts.retries = long(opts.retries)
        except (TypeError, ValueError), err:
            parser.error(u'invalid retry count specified')
    # [elided: try:]
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
    # [elided: try:]
        opts.playlistend = int(opts.playlistend)
        # -1 is the sentinel for "until the end of the playlist".
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
            parser.error(u'invalid audio format specified')

    # File downloader: translate parsed options into FileDownloader params.
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # Any of the --get-* flags implies quiet + simulate-style operation.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # Output template: explicit -o wins, then a cascade of defaults
        # keyed on --all-formats/-t/-l/-A combinations.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        # Writing the video to stdout means logs must go to stderr.
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'writesubtitles': opts.writesubtitles,
        'subtitleslang': opts.subtitleslang,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        'max_downloads': opts.max_downloads,
        'prefer_free_formats': opts.prefer_free_formats,
        'verbose': opts.verbose,
        # [elided: closing '})' of the FileDownloader options dict]
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # --update: replace this script with the latest published version.
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # With no URLs, -U alone is a valid invocation; anything else is an error.
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        # [elided: else: sys.exit, presumably]

    # [elided: try:]
        retcode = fd.download(all_urls)
    except MaxDownloadsReached:
        fd.to_screen(u'--max-download limit reached, aborting.')
        # [elided: retcode assignment, presumably]

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        # [elided: try: jar.save(), presumably]
        except (IOError, OSError), err:
            sys.exit(u'ERROR: unable to save cookie jar')
        # [elided: sys.exit(retcode), presumably, after this block]
    # Tail of the top-level main() wrapper; its 'def' line and the opening
    # 'try:' (around the call to the real entry point) fell outside this
    # view. Maps the programmatic exceptions to exit codes/messages.
    except DownloadError:
        # DownloadError was already reported when it was raised.
        # [elided: exit with a nonzero status, presumably sys.exit(1)]
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    # [elided body: presumably a call to the main() wrapper above]

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: