2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    # trivialjson fallback: a minimal JSON parser used when the stdlib `json`
    # module is unavailable (Python < 2.6).  These helpers are nested inside a
    # `loads(s)` definition whose header is elided in this excerpt; `s` is the
    # closure variable holding the input text.
    # NOTE(review): many lines are elided between the statements below;
    # indentation is reconstructed and should be verified against the full source.
    def raiseError(msg, i):
        # Abort parsing with a ValueError that pinpoints the failing offset.
        raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
    def skipSpace(i, expectMore=True):
        # Advance past ASCII whitespace starting at offset i.
        while i < len(s) and s[i] in ' \t\r\n':
            # (loop body / end-of-input guard partially elided in this excerpt)
            raiseError('Premature end', i)
    def decodeEscape(match):
        # Translate one backslash escape (matched by `rexp` below); the
        # single-character escape table and `esc = match.group(1)` are elided.
        # \uXXXX -> BMP character:
        return unichr(int(esc[1:5], 16))
        if len(esc) == 5+6 and esc[5:7] == '\\u':
            # Surrogate pair \uD8xx\uDCxx -> single astral-plane character.
            hi = int(esc[1:5], 16)
            low = int(esc[7:11], 16)
            return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
        raise ValueError('Unknown escape ' + str(esc))
        # (parseString: closing-quote search counting preceding backslashes;
        #  surrounding lines elided in this excerpt)
        while s[e-bslashes-1] == '\\':
        if bslashes % 2 == 1:
        # Matches surrogate pairs first, then plain \uXXXX, then 1-char escapes.
        rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
        stri = rexp.sub(decodeEscape, s[i:e])
        # (parseObj: object parsing loop, partially elided in this excerpt)
        if s[i] == '}': # Empty dictionary
            raiseError('Expected a string object key', i)
        i,key = parseString(i)
        if i >= len(s) or s[i] != ':':
            raiseError('Expected a colon', i)
            raiseError('Expected comma or closing curly brace', i)
        # (parseArray: array parsing loop, partially elided in this excerpt)
        if s[i] == ']': # Empty array
        i = skipSpace(i) # Raise exception if premature end
            raiseError('Expected a comma or closing bracket', i)
    def parseDiscrete(i):
        # Recognise the literals true/false/null at offset i.
        for k,v in {'true': True, 'false': False, 'null': None}.items():
            if s.startswith(k, i):
                # (return of (i+len(k), v) elided in this excerpt)
        raiseError('Not a boolean (or null)', i)
    # (parseNumber header elided in this excerpt)
        mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
        # (None check on mobj elided in this excerpt)
        raiseError('Not a number', i)
        # (`nums = mobj.group(1)` elided in this excerpt)
        if '.' in nums or 'e' in nums or 'E' in nums:
            return (i+len(nums), float(nums))
        return (i+len(nums), int(nums))
    # Dispatch on the first significant character; anything else is a number.
    CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
    i,res = CHARMAP.get(s[i], parseNumber)(i)
    i = skipSpace(i, False)
    # (end-of-input check elided in this excerpt)
    raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # Probe the locale's encoding; the surrounding try/except and the
        # ascii fallback are elided in this excerpt.
        pref = locale.getpreferredencoding()
    # .next() is the Python 2 iterator protocol: advance the generator once.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, e.g. "#160" or "#x31".
    # NOTE(review): \d does not match the hex digits a-f, so hex references
    # such as &#x1F; cannot match this pattern -- confirm against full source.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # (None check on mobj and `base` assignment elided in this excerpt)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # "xNN" -> "0xNN" so int()/long() with base 16 parses it.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # (try: and the `filename == u'-'` stdout special-case guard are
    #  partially elided in this excerpt)
        if sys.platform == 'win32':
            # Switch stdout to binary mode so piped video data is not mangled.
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # (initialisation of the result to None elided in this excerpt)
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # (return of the timestamp elided in this excerpt; presumably returns
    #  None when parsing failed -- confirm against full source)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # (body elided in this excerpt; presumably preserves first-seen order --
    #  verify against the full source)
def _unescapeHTML(s):
    """Decode HTML entities in *s* using the stdlib HTMLParser.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
def _encodeFilename(s):
    """Encode a unicode filename for the current filesystem.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # (return of `s` unchanged elided in this excerpt)
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # downloaded: number of bytes actually received
        # expected:   number of bytes the server announced (Content-Length)
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # (the `deflate(data)` static helper's header and try/except are elided
    #  in this excerpt; it first tries a raw deflate stream, then falls back
    #  to the zlib-wrapped variant)
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    # Python < 2.6 addinfourl lacks code/getcode; emulate when needed.
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # (assignment of ret.code and return of ret elided in this excerpt)

    def http_request(self, req):
        # Add the standard headers (User-Agent, Accept, ...) to the request.
        # (a guard line between the for and the add_header call is elided)
        for h in std_headers:
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            # Sentinel header: strip Accept-encoding so the server sends an
            # identity-encoded body, then remove the sentinel itself.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # (return of req elided in this excerpt)

    def http_response(self, req, resp):
        # (binding `old_resp = resp` elided in this excerpt)
        # gzip-encoded body: wrap the payload in a GzipFile for transparent
        # decompression, preserving the original response metadata.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-encoded body: decompress eagerly via the deflate() helper.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # (return of resp elided in this excerpt)
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    matchtitle: Download only matching titles.
    rejecttitle: Reject downloads for matching titles.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    writeinfojson: Write the video description to a .info.json file
    """

    # Return code of the last download batch (0 = success); reset in __init__.
    _download_retcode = None
    # Ordinal used for the %(autonumber)s output-template field.
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # (initialisation of the extractor/postprocessor lists and of
        #  self.params is elided in this excerpt)
        self._download_retcode = 0
        self._num_downloads = 0
        # Messages go to stderr instead of stdout when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    # NOTE(review): the @staticmethod decorators for the helpers below are
    # elided in this excerpt; none of them takes `self`.
    def format_bytes(bytes):
        # Render a byte count as a short human-readable string, e.g. '1.23M'.
        if type(bytes) is str:
            # (string-to-float conversion / N/A fallback elided in this excerpt)
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Fixed-width percentage string; the data_len-is-None guard is elided.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate remaining time from the average rate so far.
        # (`dif = now - start` and the '--:--' fallbacks are elided)
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # (cap for etas of 100 minutes or more elided in this excerpt)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Average download speed as a fixed-width string.
        # (`dif = now - start` elided in this excerpt)
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the read block size so each read takes roughly constant time.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # (return of new_max elided in this excerpt)
        rate = bytes / elapsed_time
        # (clamping of rate between new_min/new_max and return elided)

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # (None check on matchobj elided in this excerpt)
        number = float(matchobj.group(1))
        # Empty suffix group maps to 'b' via index 0? NOTE(review): an empty
        # group(2) gives ''.lower() -> '' which is not in 'bkmgtpezy'; the
        # handling of a missing suffix may live in elided lines -- confirm.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # (append to the internal extractor list elided in this excerpt)
        ie.set_downloader(self)  # mutual registration: the IE learns its downloader

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # (append to the internal postprocessor list elided in this excerpt)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        assert type(message) == type(u'')
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = message + terminator
            if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                output = output.encode(preferredencoding(), 'ignore')
            self._screen_file.write(output)
            self._screen_file.flush()
597 def to_stderr(self, message):
598 """Print message to stderr."""
599 print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # (early return elided in this excerpt)
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC 0 escape: sets both icon name and window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612 def fixed_template(self):
613 """Checks if the output template is fixed."""
614 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # (early return elided in this excerpt)
        # (`now = time.time()` elided in this excerpt)
        elapsed = now - start_time
        # (guard against elapsed <= 0 elided in this excerpt)
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average rate under the cap.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
            # special file (fifo, device, stdout) or .part disabled:
            # (return of `filename` unchanged elided in this excerpt)
        return filename + u'.part'

    def undo_temp_name(self, filename):
        # Strip the trailing '.part' marker added by temp_name(), if present.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        # (return of `filename` unchanged elided in this excerpt)

    def try_rename(self, old_filename, new_filename):
        # (docstring/try: elided in this excerpt)
        if old_filename == new_filename:
            # renaming onto itself is a no-op; (return elided)
        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
            # (return elided in this excerpt)
        if not os.path.isfile(_encodeFilename(filename)):
            # (return elided in this excerpt)
        timestr = last_modified_hdr
        # (guard lines elided in this excerpt)
        filetime = timeconvert(timestr)
        # (None check on filetime and surrounding try/except elided)
        os.utime(filename, (time.time(), filetime))
        # (return of filetime elided in this excerpt)
    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file has been written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
688 def report_destination(self, filename):
689 """Report destination filename."""
690 self.to_screen(u'[download] Destination: ' + filename)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # (return elided in this excerpt)
        # Leading \r rewrites the current console line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
701 def report_resuming_byte(self, resume_len):
702 """Report attempt to resume at given byte."""
703 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # (try: elided in this excerpt)
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a name-free message when the filename cannot be
            # encoded for the console.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # (else branch, which terminates the progress line, is elided)
727 def increment_downloads(self):
728 """Increment the ordinal that assigns a number to each file."""
729 self._num_downloads += 1
    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        # (try: elided in this excerpt)
        template_dict = dict(info_dict)
        # Extra template fields computed at download time:
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        # (return of filename elided in this excerpt)
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            # (return of None elided in this excerpt)

    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded """
        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            # NOTE(review): this message already carries the u'[download] '
            # prefix, but process_info() prepends the same prefix again when
            # printing the reason -- confirm whether the doubling is intended.
            return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        # (return of None -- i.e. "do download" -- elided in this excerpt)
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Title-based filtering: a non-None reason means "skip this video".
        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            # (return elided in this excerpt)

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printings (for use as a scripting backend).
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # (return elided in this excerpt)

        # (filename None check and try: elided in this excerpt)
        dn = os.path.dirname(_encodeFilename(filename))
        if dn != '' and not os.path.exists(dn): # dn is already encoded
            # (os.makedirs(dn) elided in this excerpt)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            # (return elided in this excerpt)

        if self.params.get('writedescription', False):
            # (try: elided in this excerpt)
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            descfile = open(_encodeFilename(descfn), 'wb')
            # (try/finally with descfile.close() elided in this excerpt)
            descfile.write(info_dict['description'].encode('utf-8'))
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                # (return elided in this excerpt)

        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            # (try: probing for json.dump elided in this excerpt)
            except (NameError,AttributeError):
                # `json` may be missing or be the trivialjson fallback, which
                # has no encoder.
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                # (return elided in this excerpt)
            infof = open(_encodeFilename(infofn), 'wb')
            # 'urlhandle' holds a live connection object; not serialisable.
            json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
            json.dump(json_info_dict, infof)
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                # (return elided in this excerpt)

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
                # existing file kept; (success handling elided in this excerpt)
            # (try: elided in this excerpt)
                success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                # (return elided in this excerpt)
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                # (return elided in this excerpt)

            # (if success: run the postprocessing chain -- try: elided)
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                # (return elided in this excerpt)
    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            # A placeholder-free template would write every URL to one file.
            raise SameFileError(self.params['outtmpl'])

        # (outer loop over url_list and inner loop over the registered
        #  InfoExtractors are elided in this excerpt)
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    # (continue elided in this excerpt)

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                # (ie.extract(url) call elided in this excerpt)

                # Suitable InfoExtractor had been found; go to next URL
                # (break elided in this excerpt)

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # (copy of ie_info into a local `info` dict elided in this excerpt)
        info['filepath'] = filename
        # (loop invoking each registered PostProcessor's run() elided)
    def _download_with_rtmpdump(self, filename, url, player_url):
        # Download an rtmp:// URL by shelling out to the external rtmpdump tool.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # (try: elided in this excerpt)
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # (return of False elided in this excerpt)

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
        if self.params.get('verbose', False):
            # (try: importing `pipes`, with a repr() fallback, elided)
            shell_quote = lambda args: ' '.join(map(pipes.quote, args))
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
        retval = subprocess.call(args)
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(_encodeFilename(tmpfilename))
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Retry with resume (-e); add -k 1 after a hard failure.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(_encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # No progress since last attempt: stop retrying.
                # (break elided in this excerpt)
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                # (retval reset and break elided in this excerpt)
        # (success test on retval == 0 elided in this excerpt)
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
            self.try_rename(tmpfilename, filename)
            # (return of True elided in this excerpt)
        # (else branch elided in this excerpt)
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            # (return of False elided in this excerpt)
    def _do_download(self, filename, info_dict):
        # Core HTTP download loop: resume support, retries, adaptive block
        # size, rate limiting and progress reporting.
        url = info_dict['url']
        player_url = info_dict.get('player_url', None)

        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            # (return of True elided in this excerpt)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        # (stream/open_mode initialisation elided in this excerpt)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request is kept header-free of Range for the 416 fallback below.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(_encodeFilename(tmpfilename)):
            resume_len = os.path.getsize(_encodeFilename(tmpfilename))
        # (else: resume_len = 0 elided in this excerpt)

        # (open_mode selection for append vs write elided in this excerpt)
        if self.params.get('continuedl', False):
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
        # (else branch resetting resume_len elided in this excerpt)

        # (count initialisation elided in this excerpt)
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # (try: elided in this excerpt)
                if count == 0 and 'urlhandle' in info_dict:
                    # Reuse the handle the InfoExtractor already opened.
                    data = info_dict['urlhandle']
                data = urllib2.urlopen(request)
                # (break on success elided in this excerpt)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # (re-raise elided in this excerpt)
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # (try: elided in this excerpt)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # (re-raise elided in this excerpt)
                    # (else: elided in this excerpt)
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            # (return of True elided in this excerpt)
                        # (else: elided in this excerpt)
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            # (open_mode/resume_len reset elided in this excerpt)
            # Retry bookkeeping: (count increment elided in this excerpt)
            if count <= retries:
                self.report_retry(count, retries)
        # (loop-exhausted check elided in this excerpt)
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            # (return of False elided in this excerpt)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # (block_size/start initialisation and `while True:` elided)

            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            # (`after = time.time()` elided in this excerpt)
            if len(data_block) == 0:
                # (break -- end of stream -- elided in this excerpt)
            byte_counter += len(data_block)

            # Open file just in time
            # (stream-is-None guard and try: elided in this excerpt)
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    # sanitize_open may have tweaked the name; mirror that in
                    # the final filename.
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    # (return of False elided in this excerpt)
            # (try: elided in this excerpt)
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                # (return of False elided in this excerpt)
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            if data_len is None:
                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
            # (else: elided in this excerpt)
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        # (stream-None check elided in this excerpt)
            self.trouble(u'\nERROR: Did not get any data blocks')
            # (return of False elided in this excerpt)
        # (stream.close() elided in this excerpt)
        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

        # (return of True elided in this excerpt)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # (initialisation of the lazy-init flag elided in this excerpt)
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # (ready-flag guard elided in this excerpt)
        self._real_initialize()
        # (setting of the ready flag elided in this excerpt)

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # (self.initialize() call elided in this excerpt)
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # (pass elided in this excerpt)

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # (pass elided in this excerpt)
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    NOTE(review): this excerpt elides scattered original lines
    (try:/return statements, dict entries, loop headers);
    "# [elided in excerpt]" comments below mark the gaps.
    """

    # Matches bare video IDs plus watch/embed/e/v URLs on youtube.com,
    # youtube-nocookie.com and youtu.be, rejecting playlist/artist pages.
    # Group 2 captures the video ID.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags with free (WebM) formats preferred at comparable quality
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension; [elided in excerpt] all entries but '38'
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # [elided in excerpt] remaining entries and closing brace
    # itag -> human-readable dimensions; [elided in excerpt] all entries
    _video_dimensions = {
    # [elided in excerpt] entries and closing brace
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # [elided in excerpt] loop header over `formats` binding x
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set interface language; log in and confirm age when configured."""
        if self._downloader is None:
            # [elided in excerpt] early return

        # [elided in excerpt] username/password defaults (presumably None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided in excerpt] try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # [elided in excerpt] unpack login/password from info
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # [elided in excerpt] return

        # Set language: fetch the English-language page once so later
        # scrapes see stable markup
        request = urllib2.Request(self._LANG_URL)
        # [elided in excerpt] try: / self.report_lang()
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # [elided in excerpt] return

        # No authentication to be performed
        if username is None:
            # [elided in excerpt] return

        # Log in by POSTing the signup form fields
        # [elided in excerpt] login_form = { opening and 'next' entry
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        # [elided in excerpt] closing brace
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [elided in excerpt] try: / self.report_login()
        login_results = urllib2.urlopen(request).read()
        # If the login form is still present, the credentials were rejected
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # [elided in excerpt] return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # [elided in excerpt] return

        # Confirm age
        # [elided in excerpt] age_form = { opening and 'next_url' entry
            'action_confirm': 'Confirm',
        # [elided in excerpt] closing brace
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # [elided in excerpt] try:
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [elided in excerpt] return

    def _real_extract(self, url):
        """Extract and hand off info for one YouTube video URL."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # [elided in excerpt] return
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 skips some interstitials)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # [elided in excerpt] try:
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [elided in excerpt] return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [elided in excerpt] else branch, presumably player_url = None

        # Get video info: try several &el= variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # [elided in excerpt] try:
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [elided in excerpt] break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # [elided in excerpt] return
        if 'token' not in video_info:
            if 'reason' in video_info:
                # YouTube supplied an explicit failure reason
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # [elided in excerpt] else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            # [elided in excerpt] return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [elided in excerpt] return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # [elided in excerpt] return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape from the watch page and normalize to YYYYMMDD
        # [elided in excerpt] upload_date default (presumably u'NA')
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Collapse "/,-" separators to single spaces before parsing
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # [elided in excerpt] try:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # [elided in excerpt] except/break handling

        # description: meta tag first, then lxml over the full page
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')
        # [elided in excerpt] guard for the lxml-based branch
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: single pseudo-format with no itag
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> direct URL map from the stream map blob
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit
                format_list = available_formats[available_formats.index(format_limit):]
            # [elided in excerpt] else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # [elided in excerpt] return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [elided in excerpt] return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided in excerpt] else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [elided in excerpt] membership check on url_map
                    video_url_list = [(rf, url_map[rf])]
                    # [elided in excerpt] break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    # [elided in excerpt] return
        # [elided in excerpt] else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            # [elided in excerpt] return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension (unknown itags default to flv)
            video_extension = self._video_extensions.get(format_param, 'flv')

            # [elided in excerpt] try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            # [elided in excerpt] closing of dict/call
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): this excerpt elides scattered original lines;
    "# [elided in excerpt]" comments below mark the gaps.
    """

    # Group 1 is the video id (possibly "yt-..." for YouTube-hosted clips),
    # group 2 the slugified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor; keeps a YoutubeIE to delegate yt-hosted videos to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # [elided in excerpt] try:
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            # [elided in excerpt] return

        # Confirm age by POSTing the filter form
        # [elided in excerpt] disclaimer_form = { opening and 'filters' entry
            'submit': "Continue - I'm over 18",
        # [elided in excerpt] closing brace
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # [elided in excerpt] try:
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [elided in excerpt] return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # [elided in excerpt] return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so delegate to YoutubeIE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            # [elided in excerpt] return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [elided in excerpt] try:
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            # [elided in excerpt] return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Plain mediaURL path
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # [elided in excerpt] if mobj is None:
            video_url = mediaURL
            # [elided in excerpt] else:
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [elided in excerpt] else: fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # [elided in excerpt] if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                # [elided in excerpt] return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                # [elided in excerpt] return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # [elided in excerpt] if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                # [elided in excerpt] return
            # Un-escape the JSON-escaped URL (\/ -> /)
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            # [elided in excerpt] return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [elided in excerpt] return
        video_uploader = mobj.group(1)

        # [elided in excerpt] try:
        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        # [elided in excerpt] remaining entries and closing of call
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): this excerpt elides scattered original lines;
    "# [elided in excerpt]" comments below mark the gaps.
    """

    # Group 1 is the video id (before the first underscore), group 2 the slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # [elided in excerpt] return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages render normally
        request.add_header('Cookie', 'family_filter=off')
        # [elided in excerpt] try:
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            # [elided in excerpt] return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [elided in excerpt] return
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [elided in excerpt] return
        # Strip the JSON backslash escaping from the URL
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            # [elided in excerpt] return
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [elided in excerpt] return
        video_uploader = mobj.group(1)

        # [elided in excerpt] try:
        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        # [elided in excerpt] remaining entries and closing of call
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    NOTE(review): this excerpt elides scattered original lines;
    "# [elided in excerpt]" comments below mark the gaps.
    """

    # Group 1 is the docid= value; covers the regional Google Video domains.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            # [elided in excerpt] return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # [elided in excerpt] try:
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            # [elided in excerpt] return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # [elided in excerpt] if mobj is None: fall back to the Flash URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [elided in excerpt] return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping Google applies ('=' and '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            # [elided in excerpt] return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # [elided in excerpt] if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            # [elided in excerpt] return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when forced: costs an extra request)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # [elided in excerpt] try:
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                # [elided in excerpt] return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # [elided in excerpt] if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                # [elided in excerpt] return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        # [elided in excerpt] try:
        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            # [elided in excerpt] 'uploader' entry
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        # [elided in excerpt] remaining entries and closing of call
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1803 class PhotobucketIE(InfoExtractor):
1804 """Information extractor for photobucket.com."""
1806 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807 IE_NAME = u'photobucket'
1809 def __init__(self, downloader=None):
1810 InfoExtractor.__init__(self, downloader)
1812 def report_download_webpage(self, video_id):
1813 """Report webpage download."""
1814 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1816 def report_extraction(self, video_id):
1817 """Report information extraction."""
1818 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1820 def _real_extract(self, url):
1821 # Extract id from URL
1822 mobj = re.match(self._VALID_URL, url)
1824 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1827 # At this point we have a new video
1828 self._downloader.increment_downloads()
1829 video_id = mobj.group(1)
1831 video_extension = 'flv'
1833 # Retrieve video webpage to extract further information
1834 request = urllib2.Request(url)
1836 self.report_download_webpage(video_id)
1837 webpage = urllib2.urlopen(request).read()
1838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842 # Extract URL, uploader, and title from webpage
1843 self.report_extraction(video_id)
1844 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract media URL')
1848 mediaURL = urllib.unquote(mobj.group(1))
1850 video_url = mediaURL
1852 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1854 self._downloader.trouble(u'ERROR: unable to extract title')
1856 video_title = mobj.group(1).decode('utf-8')
1857 video_title = sanitize_title(video_title)
1858 simple_title = _simplify_title(vide_title)
1860 video_uploader = mobj.group(2).decode('utf-8')
1863 # Process video information
1864 self._downloader.process_info({
1865 'id': video_id.decode('utf-8'),
1866 'url': video_url.decode('utf-8'),
1867 'uploader': video_uploader,
1868 'upload_date': u'NA',
1869 'title': video_title,
1870 'stitle': simple_title,
1871 'ext': video_extension.decode('utf-8'),
1875 except UnavailableVideoError:
1876 self._downloader.trouble(u'\nERROR: unable to download video')
1879 class YahooIE(InfoExtractor):
1880 """Information extractor for video.yahoo.com."""
1882 # _VALID_URL matches all Yahoo! Video URLs
1883 # _VPAGE_URL matches only the extractable '/watch/' URLs
1884 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886 IE_NAME = u'video.yahoo'
1888 def __init__(self, downloader=None):
1889 InfoExtractor.__init__(self, downloader)
1891 def report_download_webpage(self, video_id):
1892 """Report webpage download."""
1893 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1895 def report_extraction(self, video_id):
1896 """Report information extraction."""
1897 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1899 def _real_extract(self, url, new_video=True):
1900 # Extract ID from URL
1901 mobj = re.match(self._VALID_URL, url)
1903 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1906 # At this point we have a new video
1907 self._downloader.increment_downloads()
1908 video_id = mobj.group(2)
1909 video_extension = 'flv'
1911 # Rewrite valid but non-extractable URLs as
1912 # extractable English language /watch/ URLs
1913 if re.match(self._VPAGE_URL, url) is None:
1914 request = urllib2.Request(url)
1916 webpage = urllib2.urlopen(request).read()
1917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1923 self._downloader.trouble(u'ERROR: Unable to extract id field')
1925 yahoo_id = mobj.group(1)
1927 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1929 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1931 yahoo_vid = mobj.group(1)
1933 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934 return self._real_extract(url, new_video=False)
1936 # Retrieve video webpage to extract further information
1937 request = urllib2.Request(url)
1939 self.report_download_webpage(video_id)
1940 webpage = urllib2.urlopen(request).read()
1941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1945 # Extract uploader and title from webpage
1946 self.report_extraction(video_id)
1947 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1949 self._downloader.trouble(u'ERROR: unable to extract video title')
1951 video_title = mobj.group(1).decode('utf-8')
1952 simple_title = _simplify_title(video_title)
1954 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1956 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1958 video_uploader = mobj.group(1).decode('utf-8')
1960 # Extract video thumbnail
1961 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1963 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1965 video_thumbnail = mobj.group(1).decode('utf-8')
1967 # Extract video description
1968 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract video description')
1972 video_description = mobj.group(1).decode('utf-8')
1973 if not video_description:
1974 video_description = 'No description available.'
1976 # Extract video height and width
1977 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1979 self._downloader.trouble(u'ERROR: unable to extract video height')
1981 yv_video_height = mobj.group(1)
1983 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1985 self._downloader.trouble(u'ERROR: unable to extract video width')
1987 yv_video_width = mobj.group(1)
1989 # Retrieve video playlist to extract media URL
1990 # I'm not completely sure what all these options are, but we
1991 # seem to need most of them, otherwise the server sends a 401.
1992 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1993 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1994 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1998 self.report_download_webpage(video_id)
1999 webpage = urllib2.urlopen(request).read()
2000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2004 # Extract media URL from playlist XML
2005 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2007 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2009 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2013 # Process video information
2014 self._downloader.process_info({
2015 'id': video_id.decode('utf-8'),
2017 'uploader': video_uploader,
2018 'upload_date': u'NA',
2019 'title': video_title,
2020 'stitle': simple_title,
2021 'ext': video_extension.decode('utf-8'),
2022 'thumbnail': video_thumbnail.decode('utf-8'),
2023 'description': video_description,
2024 'thumbnail': video_thumbnail,
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'\nERROR: unable to download video')
2031 class VimeoIE(InfoExtractor):
2032 """Information extractor for vimeo.com."""
2034 # _VALID_URL matches Vimeo URLs
2035 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2038 def __init__(self, downloader=None):
2039 InfoExtractor.__init__(self, downloader)
2041 def report_download_webpage(self, video_id):
2042 """Report webpage download."""
2043 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2045 def report_extraction(self, video_id):
2046 """Report information extraction."""
2047 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2049 def _real_extract(self, url, new_video=True):
2050 # Extract ID from URL
2051 mobj = re.match(self._VALID_URL, url)
2053 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2056 # At this point we have a new video
2057 self._downloader.increment_downloads()
2058 video_id = mobj.group(1)
2060 # Retrieve video webpage to extract further information
2061 request = urllib2.Request(url, None, std_headers)
2063 self.report_download_webpage(video_id)
2064 webpage = urllib2.urlopen(request).read()
2065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2069 # Now we begin extracting as much information as we can from what we
2070 # retrieved. First we extract the information common to all extractors,
2071 # and latter we extract those that are Vimeo specific.
2072 self.report_extraction(video_id)
2074 # Extract the config JSON
2075 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2077 config = json.loads(config)
2079 self._downloader.trouble(u'ERROR: unable to extract info section')
2083 video_title = config["video"]["title"]
2084 simple_title = _simplify_title(video_title)
2087 video_uploader = config["video"]["owner"]["name"]
2089 # Extract video thumbnail
2090 video_thumbnail = config["video"]["thumbnail"]
2092 # Extract video description
2096 video_description = u'No description available.'
2097 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2098 if mobj is not None:
2099 video_description = mobj.group(1)
2101 html_parser = lxml.etree.HTMLParser()
2102 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2103 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2104 # TODO use another parser
2106 # Extract upload date
2107 video_upload_date = u'NA'
2108 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2109 if mobj is not None:
2110 video_upload_date = mobj.group(1)
2112 # Vimeo specific: extract request signature and timestamp
2113 sig = config['request']['signature']
2114 timestamp = config['request']['timestamp']
2116 # Vimeo specific: extract video quality information
2117 # TODO bind to format param
2118 if 'hd' in config["video"]["files"]["h264"]: quality = 'hd'
2119 else: quality = 'sd'
2121 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=H264&type=moogaloop_local&embed_location=" \
2122 %(video_id, sig, timestamp, quality)
2125 # Process video information
2126 self._downloader.process_info({
2129 'uploader': video_uploader,
2130 'upload_date': video_upload_date,
2131 'title': video_title,
2132 'stitle': simple_title,
2134 'thumbnail': video_thumbnail,
2135 'description': video_description,
2138 except UnavailableVideoError:
2139 self._downloader.trouble(u'ERROR: unable to download video')
2142 class GenericIE(InfoExtractor):
2143 """Generic last-resort information extractor."""
2146 IE_NAME = u'generic'
2148 def __init__(self, downloader=None):
2149 InfoExtractor.__init__(self, downloader)
2151 def report_download_webpage(self, video_id):
2152 """Report webpage download."""
2153 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2154 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2156 def report_extraction(self, video_id):
2157 """Report information extraction."""
2158 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2160 def _real_extract(self, url):
2161 # At this point we have a new video
2162 self._downloader.increment_downloads()
2164 video_id = url.split('/')[-1]
2165 request = urllib2.Request(url)
2167 self.report_download_webpage(video_id)
2168 webpage = urllib2.urlopen(request).read()
2169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2170 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2172 except ValueError, err:
2173 # since this is the last-resort InfoExtractor, if
2174 # this error is thrown, it'll be thrown here
2175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2178 self.report_extraction(video_id)
2179 # Start with something easy: JW Player in SWFObject
2180 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2182 # Broaden the search a little bit
2183 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2185 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2188 # It's possible that one of the regexes
2189 # matched, but returned an empty group:
2190 if mobj.group(1) is None:
2191 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2194 video_url = urllib.unquote(mobj.group(1))
2195 video_id = os.path.basename(video_url)
2197 # here's a fun little line of code for you:
2198 video_extension = os.path.splitext(video_id)[1][1:]
2199 video_id = os.path.splitext(video_id)[0]
2201 # it's tempting to parse this further, but you would
2202 # have to take into account all the variations like
2203 # Video Title - Site Name
2204 # Site Name | Video Title
2205 # Video Title - Tagline | Site Name
2206 # and so on and so forth; it's just not practical
2207 mobj = re.search(r'<title>(.*)</title>', webpage)
2209 self._downloader.trouble(u'ERROR: unable to extract title')
2211 video_title = mobj.group(1).decode('utf-8')
2212 video_title = sanitize_title(video_title)
2213 simple_title = _simplify_title(video_title)
2215 # video uploader is domain name
2216 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2218 self._downloader.trouble(u'ERROR: unable to extract title')
2220 video_uploader = mobj.group(1).decode('utf-8')
2223 # Process video information
2224 self._downloader.process_info({
2225 'id': video_id.decode('utf-8'),
2226 'url': video_url.decode('utf-8'),
2227 'uploader': video_uploader,
2228 'upload_date': u'NA',
2229 'title': video_title,
2230 'stitle': simple_title,
2231 'ext': video_extension.decode('utf-8'),
2235 except UnavailableVideoError, err:
2236 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this listing keeps its original line-number prefixes and
# elides several original lines (gaps in the embedded numbering). Code is
# left byte-identical; only comments are added.
# YoutubeSearchIE: resolves "ytsearch[N|all]:<query>" pseudo-URLs by scraping
# YouTube result pages and delegating each found video to the YouTube IE.
2239 class YoutubeSearchIE(InfoExtractor):
2240 """Information Extractor for YouTube search queries."""
2241 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2242 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2243 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2244 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2246 _max_youtube_results = 1000
2247 IE_NAME = u'youtube:search'
2249 def __init__(self, youtube_ie, downloader=None):
2250 InfoExtractor.__init__(self, downloader)
# The actual downloading of each result is delegated to this YouTube IE.
2251 self._youtube_ie = youtube_ie
2253 def report_download_page(self, query, pagenum):
2254 """Report attempt to download playlist page with given number."""
2255 query = query.decode(preferredencoding())
2256 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2258 def _real_initialize(self):
2259 self._youtube_ie.initialize()
# Parses the "ytsearch", "ytsearchN" or "ytsearchall" prefix and dispatches
# to _download_n_results with the requested count (clamped to the maximum).
2261 def _real_extract(self, query):
2262 mobj = re.match(self._VALID_URL, query)
2264 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2267 prefix, query = query.split(':')
2269 query = query.encode('utf-8')
2271 self._download_n_results(query, 1)
2273 elif prefix == 'all':
2274 self._download_n_results(query, self._max_youtube_results)
2280 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2282 elif n > self._max_youtube_results:
2283 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2284 n = self._max_youtube_results
2285 self._download_n_results(query, n)
2287 except ValueError: # parsing prefix as integer fails
2288 self._download_n_results(query, 1)
2291 def _download_n_results(self, query, n):
2292 """Downloads a specified number of results for a query"""
# Dedupe across pages: ids already handed out are remembered here.
2295 already_seen = set()
2299 self.report_download_page(query, pagenum)
2300 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2301 request = urllib2.Request(result_url)
2303 page = urllib2.urlopen(request).read()
2304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2308 # Extract video identifiers
2309 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): fragile id extraction — takes the third '='-separated field of
# the matched href and strips the trailing quote; breaks if YouTube adds query
# parameters to the href.
2310 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2311 if video_id not in already_seen:
2312 video_ids.append(video_id)
2313 already_seen.add(video_id)
2314 if len(video_ids) == n:
2315 # Specified n videos reached
2316 for id in video_ids:
2317 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on this page: flush whatever was collected and stop.
2320 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2321 for id in video_ids:
2322 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2325 pagenum = pagenum + 1
# NOTE(review): listing keeps original line-number prefixes; several original
# lines are elided (numbering gaps). Code byte-identical; comments only.
# GoogleSearchIE: resolves "gvsearch[N|all]:<query>" pseudo-URLs against
# Google Video search, delegating each hit to the Google IE.
2328 class GoogleSearchIE(InfoExtractor):
2329 """Information Extractor for Google Video search queries."""
2330 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2331 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2332 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2333 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2335 _max_google_results = 1000
2336 IE_NAME = u'video.google:search'
2338 def __init__(self, google_ie, downloader=None):
2339 InfoExtractor.__init__(self, downloader)
2340 self._google_ie = google_ie
2342 def report_download_page(self, query, pagenum):
2343 """Report attempt to download playlist page with given number."""
2344 query = query.decode(preferredencoding())
2345 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2347 def _real_initialize(self):
2348 self._google_ie.initialize()
# Same prefix-parsing structure as YoutubeSearchIE._real_extract.
2350 def _real_extract(self, query):
2351 mobj = re.match(self._VALID_URL, query)
2353 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2356 prefix, query = query.split(':')
2358 query = query.encode('utf-8')
2360 self._download_n_results(query, 1)
2362 elif prefix == 'all':
2363 self._download_n_results(query, self._max_google_results)
2369 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2371 elif n > self._max_google_results:
2372 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2373 n = self._max_google_results
2374 self._download_n_results(query, n)
2376 except ValueError: # parsing prefix as integer fails
2377 self._download_n_results(query, 1)
2380 def _download_n_results(self, query, n):
2381 """Downloads a specified number of results for a query"""
2387 self.report_download_page(query, pagenum)
# Google paginates by result offset, hence pagenum*10 (10 results per page).
2388 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2389 request = urllib2.Request(result_url)
2391 page = urllib2.urlopen(request).read()
2392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2393 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2396 # Extract video identifiers
2397 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2398 video_id = mobj.group(1)
# NOTE(review): O(n) list-membership dedupe; the sibling search IEs use an
# 'already_seen' set for this — harmless at n <= 1000 but inconsistent.
2399 if video_id not in video_ids:
2400 video_ids.append(video_id)
2401 if len(video_ids) == n:
2402 # Specified n videos reached
2403 for id in video_ids:
2404 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2407 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2408 for id in video_ids:
2409 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2412 pagenum = pagenum + 1
# NOTE(review): listing keeps original line-number prefixes; several original
# lines are elided (numbering gaps). Code byte-identical; comments only.
# YahooSearchIE: resolves "yvsearch[N|all]:<query>" pseudo-URLs against
# Yahoo! Video search, delegating each hit to the Yahoo IE.
2415 class YahooSearchIE(InfoExtractor):
2416 """Information Extractor for Yahoo! Video search queries."""
2417 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2418 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2419 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
# NOTE(review): very loose pagination marker — matches any "Next" preceded by
# whitespace anywhere in the page; verify against current markup if revived.
2420 _MORE_PAGES_INDICATOR = r'\s*Next'
2422 _max_yahoo_results = 1000
2423 IE_NAME = u'video.yahoo:search'
2425 def __init__(self, yahoo_ie, downloader=None):
2426 InfoExtractor.__init__(self, downloader)
2427 self._yahoo_ie = yahoo_ie
2429 def report_download_page(self, query, pagenum):
2430 """Report attempt to download playlist page with given number."""
2431 query = query.decode(preferredencoding())
2432 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2434 def _real_initialize(self):
2435 self._yahoo_ie.initialize()
# Same prefix-parsing structure as the other search IEs.
2437 def _real_extract(self, query):
2438 mobj = re.match(self._VALID_URL, query)
2440 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2443 prefix, query = query.split(':')
2445 query = query.encode('utf-8')
2447 self._download_n_results(query, 1)
2449 elif prefix == 'all':
2450 self._download_n_results(query, self._max_yahoo_results)
2456 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2458 elif n > self._max_yahoo_results:
2459 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2460 n = self._max_yahoo_results
2461 self._download_n_results(query, n)
2463 except ValueError: # parsing prefix as integer fails
2464 self._download_n_results(query, 1)
2467 def _download_n_results(self, query, n):
2468 """Downloads a specified number of results for a query"""
# Dedupe across pages, same pattern as YoutubeSearchIE.
2471 already_seen = set()
2475 self.report_download_page(query, pagenum)
2476 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2477 request = urllib2.Request(result_url)
2479 page = urllib2.urlopen(request).read()
2480 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2481 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2484 # Extract video identifiers
2485 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2486 video_id = mobj.group(1)
2487 if video_id not in already_seen:
2488 video_ids.append(video_id)
2489 already_seen.add(video_id)
2490 if len(video_ids) == n:
2491 # Specified n videos reached
2492 for id in video_ids:
2493 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2496 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2497 for id in video_ids:
2498 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2501 pagenum = pagenum + 1
2504 class YoutubePlaylistIE(InfoExtractor):
2505 """Information Extractor for YouTube playlists."""
2507 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2508 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2509 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2510 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2512 IE_NAME = u'youtube:playlist'
2514 def __init__(self, youtube_ie, downloader=None):
2515 InfoExtractor.__init__(self, downloader)
2516 self._youtube_ie = youtube_ie
2518 def report_download_page(self, playlist_id, pagenum):
2519 """Report attempt to download playlist page with given number."""
2520 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2522 def _real_initialize(self):
2523 self._youtube_ie.initialize()
2525 def _real_extract(self, url):
2526 # Extract playlist id
2527 mobj = re.match(self._VALID_URL, url)
2529 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2533 if mobj.group(3) is not None:
2534 self._youtube_ie.extract(mobj.group(3))
2537 # Download playlist pages
2538 # prefix is 'p' as default for playlists but there are other types that need extra care
2539 playlist_prefix = mobj.group(1)
2540 if playlist_prefix == 'a':
2541 playlist_access = 'artist'
2543 playlist_prefix = 'p'
2544 playlist_access = 'view_play_list'
2545 playlist_id = mobj.group(2)
2550 self.report_download_page(playlist_id, pagenum)
2551 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2552 request = urllib2.Request(url)
2554 page = urllib2.urlopen(request).read()
2555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2556 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 if mobj.group(1) not in ids_in_page:
2563 ids_in_page.append(mobj.group(1))
2564 video_ids.extend(ids_in_page)
2566 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2568 pagenum = pagenum + 1
2570 playliststart = self._downloader.params.get('playliststart', 1) - 1
2571 playlistend = self._downloader.params.get('playlistend', -1)
2572 video_ids = video_ids[playliststart:playlistend]
2574 for id in video_ids:
2575 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): listing keeps original line-number prefixes; several original
# lines are elided (numbering gaps). Code byte-identical; comments only.
# YoutubeUserIE: collects all upload ids of a YouTube user via the GData API
# (paged, 50 at a time) and delegates each one to the YouTube IE.
2579 class YoutubeUserIE(InfoExtractor):
2580 """Information Extractor for YouTube users."""
2582 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2583 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2584 _GDATA_PAGE_SIZE = 50
2585 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2586 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2588 IE_NAME = u'youtube:user'
2590 def __init__(self, youtube_ie, downloader=None):
2591 InfoExtractor.__init__(self, downloader)
2592 self._youtube_ie = youtube_ie
2594 def report_download_page(self, username, start_index):
2595 """Report attempt to download user page."""
2596 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2597 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2599 def _real_initialize(self):
2600 self._youtube_ie.initialize()
2602 def _real_extract(self, url):
2604 mobj = re.match(self._VALID_URL, url)
2606 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2609 username = mobj.group(1)
2611 # Download video ids using YouTube Data API. Result size per
2612 # query is limited (currently to 50 videos) so we need to query
2613 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2620 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2621 self.report_download_page(username, start_index)
2623 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2626 page = urllib2.urlopen(request).read()
2627 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2628 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2631 # Extract video identifiers
2634 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2635 if mobj.group(1) not in ids_in_page:
2636 ids_in_page.append(mobj.group(1))
2638 video_ids.extend(ids_in_page)
2640 # A little optimization - if current page is not
2641 # "full", ie. does not contain PAGE_SIZE video ids then
2642 # we can assume that this page is the last one - there
2643 # are no more ids on further pages - no need to query
2646 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2651 all_ids_count = len(video_ids)
2652 playliststart = self._downloader.params.get('playliststart', 1) - 1
2653 playlistend = self._downloader.params.get('playlistend', -1)
# -1 means "no end limit"; slice open-ended so the last video is kept.
2655 if playlistend == -1:
2656 video_ids = video_ids[playliststart:]
2658 video_ids = video_ids[playliststart:playlistend]
2660 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2661 (username, all_ids_count, len(video_ids)))
2663 for video_id in video_ids:
2664 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2667 class DepositFilesIE(InfoExtractor):
2668 """Information extractor for depositfiles.com"""
2670 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2671 IE_NAME = u'DepositFiles'
2673 def __init__(self, downloader=None):
2674 InfoExtractor.__init__(self, downloader)
2676 def report_download_webpage(self, file_id):
2677 """Report webpage download."""
2678 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2680 def report_extraction(self, file_id):
2681 """Report information extraction."""
2682 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2684 def _real_extract(self, url):
2685 # At this point we have a new file
2686 self._downloader.increment_downloads()
2688 file_id = url.split('/')[-1]
2689 # Rebuild url in english locale
2690 url = 'http://depositfiles.com/en/files/' + file_id
2692 # Retrieve file webpage with 'Free download' button pressed
2693 free_download_indication = { 'gateway_result' : '1' }
2694 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2696 self.report_download_webpage(file_id)
2697 webpage = urllib2.urlopen(request).read()
2698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2699 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2702 # Search for the real file URL
2703 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2704 if (mobj is None) or (mobj.group(1) is None):
2705 # Try to figure out reason of the error.
2706 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2707 if (mobj is not None) and (mobj.group(1) is not None):
2708 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2709 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2711 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2714 file_url = mobj.group(1)
2715 file_extension = os.path.splitext(file_url)[1][1:]
2717 # Search for file title
2718 mobj = re.search(r'<b title="(.*?)">', webpage)
2720 self._downloader.trouble(u'ERROR: unable to extract title')
2722 file_title = mobj.group(1).decode('utf-8')
2725 # Process file information
2726 self._downloader.process_info({
2727 'id': file_id.decode('utf-8'),
2728 'url': file_url.decode('utf-8'),
2730 'upload_date': u'NA',
2731 'title': file_title,
2732 'stitle': file_title,
2733 'ext': file_extension.decode('utf-8'),
2737 except UnavailableVideoError, err:
2738 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): listing keeps original line-number prefixes; many original
# lines are elided (numbering gaps) — e.g. the _video_extensions entries, the
# login form construction and several control-flow lines. Code byte-identical;
# comments only. The login/format-selection flow cannot be fully reviewed here.
2741 class FacebookIE(InfoExtractor):
2742 """Information Extractor for Facebook"""
2744 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2745 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2746 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below indexes into this order.
2747 _available_formats = ['video', 'highqual', 'lowqual']
2748 _video_extensions = {
2753 IE_NAME = u'facebook'
2755 def __init__(self, downloader=None):
2756 InfoExtractor.__init__(self, downloader)
2758 def _reporter(self, message):
2759 """Add header and report message."""
2760 self._downloader.to_screen(u'[facebook] %s' % message)
2762 def report_login(self):
2763 """Report attempt to log in."""
2764 self._reporter(u'Logging in')
2766 def report_video_webpage_download(self, video_id):
2767 """Report attempt to download video webpage."""
2768 self._reporter(u'%s: Downloading video webpage' % video_id)
2770 def report_information_extraction(self, video_id):
2771 """Report attempt to extract video information."""
2772 self._reporter(u'%s: Extracting video information' % video_id)
2774 def _parse_page(self, video_webpage):
2775 """Extract video information from page"""
# Regexes keyed by the video_info field they populate.
2777 data = {'title': r'\("video_title", "(.*?)"\)',
2778 'description': r'<div class="datawrap">(.*?)</div>',
2779 'owner': r'\("video_owner_name", "(.*?)"\)',
2780 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2783 for piece in data.keys():
2784 mobj = re.search(data[piece], video_webpage)
2785 if mobj is not None:
2786 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2790 for fmt in self._available_formats:
2791 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2792 if mobj is not None:
2793 # URL is in a Javascript segment inside an escaped Unicode format within
2794 # the generally utf-8 page
2795 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2796 video_info['video_urls'] = video_urls
2800 def _real_initialize(self):
2801 if self._downloader is None:
2806 downloader_params = self._downloader.params
2808 # Attempt to use provided username and password or .netrc data
2809 if downloader_params.get('username', None) is not None:
2810 useremail = downloader_params['username']
2811 password = downloader_params['password']
2812 elif downloader_params.get('usenetrc', False):
2814 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2815 if info is not None:
2819 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2820 except (IOError, netrc.NetrcParseError), err:
2821 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# NOTE(review): if neither branch above assigned useremail, this read would
# raise; presumably an elided line initializes it — confirm in full source.
2824 if useremail is None:
2833 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2836 login_results = urllib2.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
2837 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2838 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2841 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2844 def _real_extract(self, url):
2845 mobj = re.match(self._VALID_URL, url)
2847 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2849 video_id = mobj.group('ID')
2852 self.report_video_webpage_download(video_id)
2853 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2855 page = urllib2.urlopen(request)
2856 video_webpage = page.read()
2857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2858 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2861 # Start extracting information
2862 self.report_information_extraction(video_id)
2864 # Extract information
2865 video_info = self._parse_page(video_webpage)
2868 if 'owner' not in video_info:
2869 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2871 video_uploader = video_info['owner']
2874 if 'title' not in video_info:
2875 self._downloader.trouble(u'ERROR: unable to extract video title')
2877 video_title = video_info['title']
2878 video_title = video_title.decode('utf-8')
2879 video_title = sanitize_title(video_title)
2881 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning, not fatal.
2884 if 'thumbnail' not in video_info:
2885 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2886 video_thumbnail = ''
2888 video_thumbnail = video_info['thumbnail']
2892 if 'upload_date' in video_info:
2893 upload_time = video_info['upload_date']
# parsedate_tz handles RFC 2822 style dates; reformat to YYYYMMDD.
2894 timetuple = email.utils.parsedate_tz(upload_time)
2895 if timetuple is not None:
2897 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2902 video_description = video_info.get('description', 'No description available.')
2904 url_map = video_info['video_urls']
2905 if len(url_map.keys()) > 0:
2906 # Decide which formats to download
2907 req_format = self._downloader.params.get('format', None)
2908 format_limit = self._downloader.params.get('format_limit', None)
2910 if format_limit is not None and format_limit in self._available_formats:
2911 format_list = self._available_formats[self._available_formats.index(format_limit):]
2913 format_list = self._available_formats
2914 existing_formats = [x for x in format_list if x in url_map]
2915 if len(existing_formats) == 0:
2916 self._downloader.trouble(u'ERROR: no known formats available for video')
2918 if req_format is None:
2919 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2920 elif req_format == 'worst':
# NOTE(review): existing_formats[len(existing_formats)-1] is just
# existing_formats[-1]; left unchanged here.
2921 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2922 elif req_format == '-1':
2923 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2926 if req_format not in url_map:
2927 self._downloader.trouble(u'ERROR: requested format not available')
2929 video_url_list = [(req_format, url_map[req_format])] # Specific format
2931 for format_param, video_real_url in video_url_list:
2933 # At this point we have a new video
2934 self._downloader.increment_downloads()
2937 video_extension = self._video_extensions.get(format_param, 'mp4')
2940 # Process video information
2941 self._downloader.process_info({
2942 'id': video_id.decode('utf-8'),
2943 'url': video_real_url.decode('utf-8'),
2944 'uploader': video_uploader.decode('utf-8'),
2945 'upload_date': upload_date,
2946 'title': video_title,
2947 'stitle': simple_title,
2948 'ext': video_extension.decode('utf-8'),
2949 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2950 'thumbnail': video_thumbnail.decode('utf-8'),
2951 'description': video_description.decode('utf-8'),
2954 except UnavailableVideoError, err:
2955 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing keeps original line-number prefixes; several original
# lines are elided (numbering gaps) — e.g. the 'info' initialization, 'try:'
# lines and parts of the direct-download info dict. Code byte-identical;
# comments only.
# BlipTVIE: uses blip.tv's JSON API ('skin=json') unless the URL turns out to
# be a direct media file, which is detected via the response Content-Type.
2957 class BlipTVIE(InfoExtractor):
2958 """Information extractor for blip.tv"""
2960 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2961 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2962 IE_NAME = u'blip.tv'
2964 def report_extraction(self, file_id):
2965 """Report information extraction."""
2966 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2968 def report_direct_download(self, title):
2969 """Report information extraction."""
2970 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2972 def _real_extract(self, url):
2973 mobj = re.match(self._VALID_URL, url)
2975 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar joins the JSON-skin parameters onto the URL ('?' or '&').
2982 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2983 request = urllib2.Request(json_url)
2984 self.report_extraction(mobj.group(1))
2987 urlh = urllib2.urlopen(request)
2988 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2989 basename = url.split('/')[-1]
2990 title,ext = os.path.splitext(basename)
2991 title = title.decode('UTF-8')
2992 ext = ext.replace('.', '')
2993 self.report_direct_download(title)
2998 'stitle': _simplify_title(title),
3002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3003 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3005 if info is None: # Regular URL
3007 json_code = urlh.read()
3008 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3009 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3013 json_data = json.loads(json_code)
# The API wraps the payload in a 'Post' object for single items.
3014 if 'Post' in json_data:
3015 data = json_data['Post']
# API datestamp format, e.g. '11-28-09 07:30PM', reformatted to YYYYMMDD.
3019 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3020 video_url = data['media']['url']
3021 umobj = re.match(self._URL_EXT, video_url)
3023 raise ValueError('Can not determine filename extension')
3024 ext = umobj.group(1)
3027 'id': data['item_id'],
3029 'uploader': data['display_name'],
3030 'upload_date': upload_date,
3031 'title': data['title'],
3032 'stitle': _simplify_title(data['title']),
3034 'format': data['media']['mimeType'],
3035 'thumbnail': data['thumbnailUrl'],
3036 'description': data['description'],
3037 'player_url': data['embedUrl']
3039 except (ValueError,KeyError), err:
3040 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3043 self._downloader.increment_downloads()
3046 self._downloader.process_info(info)
3047 except UnavailableVideoError, err:
3048 self._downloader.trouble(u'\nERROR: unable to download video')
3051 class MyVideoIE(InfoExtractor):
3052 """Information Extractor for myvideo.de."""
3054 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3055 IE_NAME = u'myvideo'
3057 def __init__(self, downloader=None):
3058 InfoExtractor.__init__(self, downloader)
3060 def report_download_webpage(self, video_id):
3061 """Report webpage download."""
3062 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3064 def report_extraction(self, video_id):
3065 """Report information extraction."""
3066 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3068 def _real_extract(self,url):
3069 mobj = re.match(self._VALID_URL, url)
3071 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3074 video_id = mobj.group(1)
3077 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3079 self.report_download_webpage(video_id)
3080 webpage = urllib2.urlopen(request).read()
3081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3082 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3085 self.report_extraction(video_id)
3086 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3089 self._downloader.trouble(u'ERROR: unable to extract media URL')
3091 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3093 mobj = re.search('<title>([^<]+)</title>', webpage)
3095 self._downloader.trouble(u'ERROR: unable to extract title')
3098 video_title = mobj.group(1)
3099 video_title = sanitize_title(video_title)
3101 simple_title = _simplify_title(video_title)
3104 self._downloader.process_info({
3108 'upload_date': u'NA',
3109 'title': video_title,
3110 'stitle': simple_title,
3115 except UnavailableVideoError:
3116 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ":tds"/":colbert"-style short aliases as well as full
    # thedailyshow.com / colbertnation.com full-episodes URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        # Progress message: metadata extraction has started.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        # Progress message: downloading the per-item mediaGen configuration.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        # Progress message: downloading the MRSS show index.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        # Progress message: resolving the Flash player URL.
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        # Resolve alias -> canonical URL, locate the mtvnservices Flash URI
        # in the page, then iterate over every <item> of the MRSS index and
        # download the highest-bitrate rendition of each.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            # Re-match so the named groups refer to the rewritten URL.
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode in the URL: the site redirects to the newest.
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # After following redirects, re-derive the episode title from the
        # final URL reported by the handle.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The mtvnservices URI is embedded either in a <param name="movie">
        # tag or in a "var url = ..." JS assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # <guid> looks like "...:<show>.com:<id>"; split out both parts.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, src) pairs for every available rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'stitle': _simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl

            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: extraction has started.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: downloading the player configuration.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # Scrape the page's <meta> tags for description/thumbnail/player,
        # fetch the player's config (a JS object literal), and take the
        # media URL from playlist entry [1].
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # NOTE(review): the .group(1) calls below assume every <meta> tag is
        # present; a missing tag would raise AttributeError on the None
        # match object rather than report a clean error -- confirm.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        'uploader': showName,
        'upload_date': None,
        'stitle': _simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # The public page only exposes an internal video id; the real media
        # metadata comes from the "moogaloop" XML endpoint.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # Pull title/description/file/thumbnail out of the metadata XML.
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Derive extension (and format) from the media URL's suffix.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Always refetch via the canonical bare /video<id> URL.
        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The media URL is percent-encoded in the page's flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the part of <title> before the " - XVID..." suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3485 class SoundcloudIE(InfoExtractor):
3486 """Information extractor for soundcloud.com
3487 To access the media, the uid of the song and a stream token
3488 must be extracted from the page source and the script must make
3489 a request to media.soundcloud.com/crossdomain.xml. Then
3490 the media can be grabbed by requesting from an url composed
3491 of the stream token and uid
3494 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3495 IE_NAME = u'soundcloud'
3497 def __init__(self, downloader=None):
3498 InfoExtractor.__init__(self, downloader)
3500 def report_webpage(self, video_id):
3501 """Report information extraction."""
3502 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3504 def report_extraction(self, video_id):
3505 """Report information extraction."""
3506 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3508 def _real_extract(self, url):
3509 htmlParser = HTMLParser.HTMLParser()
3511 mobj = re.match(self._VALID_URL, url)
3513 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3516 # extract uploader (which is in the url)
3517 uploader = mobj.group(1).decode('utf-8')
3518 # extract simple title (uploader + slug of song title)
3519 slug_title = mobj.group(2).decode('utf-8')
3520 simple_title = uploader + '-' + slug_title
3522 self.report_webpage('%s/%s' % (uploader, slug_title))
3524 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3526 webpage = urllib2.urlopen(request).read()
3527 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3528 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3531 self.report_extraction('%s/%s' % (uploader, slug_title))
3533 # extract uid and stream token that soundcloud hands out for access
3534 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3536 video_id = mobj.group(1)
3537 stream_token = mobj.group(2)
3539 # extract unsimplified title
3540 mobj = re.search('"title":"(.*?)",', webpage)
3542 title = mobj.group(1)
3544 # construct media url (with uid/token)
3545 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3546 mediaURL = mediaURL % (video_id, stream_token)
3549 description = u'No description available'
3550 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3552 description = mobj.group(1)
3556 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3559 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3560 except Exception, e:
3563 # for soundcloud, a request to a cross domain is required for cookies
3564 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3567 self._downloader.process_info({
3568 'id': video_id.decode('utf-8'),
3570 'uploader': uploader.decode('utf-8'),
3571 'upload_date': upload_date,
3572 'title': simple_title.decode('utf-8'),
3573 'stitle': simple_title.decode('utf-8'),
3577 'description': description.decode('utf-8')
3579 except UnavailableVideoError:
3580 self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The jsclassref attribute holds a base64-encoded, URL-quoted RTMP
        # path fragment.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # The id and extension come from the RTMP path's final component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # Formats may map bitrate -> url list, or be a bare url list when
        # no bitrate information is present (the TypeError path).
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a live request; unreachable URLs are
        # skipped via the network-error except clause.
        for url in url_list:
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        # Human-readable dump of format / bitrate / extension combinations.
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): probe each format and keep the first
        # URL that is actually reachable.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a CoursePage, or a VideoPage; the optional
    # course=/video= query parameters are captured as named groups.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Three cases: a specific video, a course page (list of videos), or
        # the site root (list of courses). The list cases build 'reference'
        # entries and recurse into each one via self.extract().
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': _simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            'id': _simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fallback: reuse the id when no <h1> title is found.
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link becomes a 'reference' entry to recurse on.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
            # (root-page branch: enumerate all courses)
            unescapeHTML = HTMLParser.HTMLParser().unescape

            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Every CoursePage link becomes a 'reference' entry to recurse on.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
3901 class MTVIE(InfoExtractor):
3902 """Information extractor for MTV.com"""
3904 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3907 def report_webpage(self, video_id):
3908 """Report information extraction."""
3909 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3911 def report_extraction(self, video_id):
3912 """Report information extraction."""
3913 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3915 def _real_extract(self, url):
3916 mobj = re.match(self._VALID_URL, url)
3918 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3920 if not mobj.group('proto'):
3921 url = 'http://' + url
3922 video_id = mobj.group('videoid')
3923 self.report_webpage(video_id)
3925 request = urllib2.Request(url)
3927 webpage = urllib2.urlopen(request).read()
3928 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3929 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3932 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3934 self._downloader.trouble(u'ERROR: unable to extract song name')
3936 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3937 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3939 self._downloader.trouble(u'ERROR: unable to extract performer')
3941 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3942 video_title = performer + ' - ' + song_name
3944 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3946 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3948 mtvn_uri = mobj.group(1)
3950 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3952 self._downloader.trouble(u'ERROR: unable to extract content id')
3954 content_id = mobj.group(1)
3956 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3957 self.report_extraction(video_id)
3958 request = urllib2.Request(videogen_url)
3960 metadataXml = urllib2.urlopen(request).read()
3961 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3962 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3965 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3966 renditions = mdoc.findall('.//rendition')
3968 # For now, always pick the highest quality.
3969 rendition = renditions[-1]
3972 _,_,ext = rendition.attrib['type'].partition('/')
3973 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3974 video_url = rendition.find('./src').text
3976 self._downloader.trouble('Invalid rendition field.')
3979 self._downloader.increment_downloads()
3983 'uploader': performer,
3984 'title': video_title,
3985 'stitle': _simplify_title(video_title),
3991 self._downloader.process_info(info)
3992 except UnavailableVideoError, err:
3993 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances are attached to a downloader through its
    add_post_processor() method. After each successful download, the
    downloader walks its chain of PostProcessors, calling run() on each
    one: the first receives the downloader's info dictionary, and every
    subsequent one receives whatever the previous run() returned.

    Returning None from run() stops the chain; returning an information
    dictionary (possibly modified) forwards it to the next processor.

    Like InfoExtractor, this class follows a "mutual registration"
    pattern with its downloader.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach *downloader* to this post processor."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        *information* is an InfoExtractor-style dictionary carrying one
        extra key, "filepath", that names the downloaded file on disk.

        The default implementation is a no-op: it forwards the
        dictionary unchanged so the chain continues. Subclasses may
        return a modified dictionary, return None to stop the chain, or
        raise PostProcessingError to signal failure to the downloader.
        """
        return information # by default, do nothing
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails during audio extraction.

    BUG FIX: previously subclassed BaseException, which ordinary
    `except Exception:` handlers never catch (BaseException is reserved
    for exit-style exceptions like KeyboardInterrupt/SystemExit). Also
    now calls the base initializer so args/str(err) carry the message.
    """

    def __init__(self, message):
        Exception.__init__(self, message)
        # Kept for callers that read err.message directly.
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that turns a downloaded video file into an
	audio-only file by driving the external ffmpeg/ffprobe binaries.

	NOTE(review): several lines of this class appear to have been lost
	in this copy (truncated try/except and if bodies below); compare
	against the upstream youtube-dl source before changing logic.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		# preferredcodec: target codec name ('aac', 'mp3', 'vorbis',
		#   'm4a', 'wav') or 'best'; None is normalized to 'best'.
		# preferredquality: ffmpeg bitrate spec (e.g. '128K'), used for -ab.
		# keepvideo: when True, the source video is kept after extraction.
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	def get_audio_codec(path):
		# Probe *path* with ffprobe and return the codec name of its
		# audio stream (e.g. 'aac').  No self parameter — presumably a
		# @staticmethod in the original; decorator not visible here.
		cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
		handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
		output = handle.communicate()[0]
		if handle.wait() != 0:
		except (IOError, OSError):
		# Scan ffprobe's key=value output: remember the last
		# codec_name, and accept it once a codec_type=audio line
		# confirms it belongs to an audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		# Transcode *path* to *out_path* with the given audio codec and
		# extra ffmpeg options; raises AudioConversionError on failure.
		# No self parameter — presumably a @staticmethod originally.
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 (ENOENT): the ffmpeg binary itself is missing.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
		if p.returncode != 0:
			# Surface the last stderr line as the conversion error.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		# Entry point of the post-processor: convert the file named in
		# information['filepath'] and point the info dict at the result.
		path = information['filepath']
		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
		# Keep the stream as-is (remux/copy) when the source already
		# matches the request; otherwise transcode below.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
				# Fallback: transcode to MP3 via LAME.
				acodec = 'libmp3lame'
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
			if self._preferredcodec == 'wav':
				more_opts += ['-f', 'wav']
		# Build the destination name by swapping the extension by hand:
		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
			self.run_ffmpeg(path, new_path, acodec, more_opts)
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
		if not self._keepvideo:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
		# Hand the new filename to the next post-processor in the chain.
		information['filepath'] = new_path
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	# NOTE(review): some try/else lines of this function appear to be
	# missing in this copy of the file.
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen(u'Updating to latest version...')

	# Fetch the latest script and compare its embedded version string
	# against ours; bail out early when already up to date.
		urlh = urllib.urlopen(UPDATE_URL)
		newcontent = urlh.read()
		vmatch = re.search("__version__ = '([^']+)'", newcontent)
		if vmatch is not None and vmatch.group(1) == __version__:
			downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')

	# Overwrite this very script in place with the downloaded content.
		outf = open(filename, 'wb')
			outf.write(newcontent)
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
	# Read extra command-line arguments from a config file and return
	# them as a list of argv-style tokens (shlex-split, with '#'
	# comments stripped).  A missing file yields an empty list.
	# NOTE(review): the try/except and read loop around these lines
	# appear to be missing in this copy of the file.
		optionf = open(filename_bytes)
		return [] # silently skip if file is not present
		res += shlex.split(l, comments=True)
def _format_option_string(option):
	''' ('-o', '--option') -> -o, --format METAVAR'''
	# Build the left-hand help column for one optparse option: short
	# flag, ', ' separator, long flag, then ' METAVAR' if the option
	# takes a value.  NOTE(review): the `opts = []` initializer seems
	# to be missing from this copy.
	if option._short_opts: opts.append(option._short_opts[0])
	if option._long_opts: opts.append(option._long_opts[0])
	if len(opts) > 1: opts.insert(1, ', ')

	if option.takes_value(): opts.append(' %s' % option.metavar)

	return "".join(opts)
def _find_term_columns():
	# Best-effort terminal width: honour the COLUMNS environment
	# variable first, otherwise ask `stty size` (which prints
	# "rows cols") and take the second field.
	columns = os.environ.get('COLUMNS', None)
		sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		out,err = sp.communicate()
		return int(out.split()[1])
# --- body of parseOpts() (its def line is not visible in this copy) ---
# Builds the optparse parser with all option groups, merges config-file
# options with sys.argv, and returns (parser, opts, args).
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string

# NOTE(review): the next three lines are entries of a kw dict (its
# `kw = {` opener is missing here); it is unpacked into OptionParser below.
'version' : __version__,
'usage' : '%prog [options] url [url...]',
'conflict_handler' : 'resolve',
parser = optparse.OptionParser(**kw)

# One option group per help section.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)
general.add_option('--list-extractors',
		action='store_true', dest='list_extractors',
		help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
		action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
		action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
		action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
		action='store_true', dest='getformat',
		help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)
# Note: -v is declared twice (--version above); conflict_handler='resolve'
# lets this later definition win for -v.
verbosity.add_option('-v', '--verbose',
		action='store_true', dest='verbose', help='print various debugging information', default=False)

filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
		action='store_false', dest='continue_dl',
		help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
		help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
		help='keeps the video file on disk after the post-processing; the video is erased by default')

parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

# Config precedence (lowest to highest): system-wide /etc config, then
# the per-user config, then the actual command line.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)

return parser, opts, args
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	# NOTE(review): the docstring terminator and the surrounding
	# `return [` ... `]` of this list appear to be missing in this copy.
	# Several derived extractors (playlist/user/search) share one base
	# extractor instance, created once here.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
		YoutubePlaylistIE(youtube_ie),
		YoutubeUserIE(youtube_ie),
		YoutubeSearchIE(youtube_ie),
		MetacafeIE(youtube_ie),
		GoogleSearchIE(google_ie),
		YahooSearchIE(yahoo_ie),
		StanfordOpenClassroomIE(),
# --- body of the main driver function (its def line is not visible in
# this copy; the trailing except clauses at the bottom presumably belong
# to an outer main() wrapper). Parses options, validates them, builds
# the FileDownloader and runs the downloads. ---
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string we send and exit.
if opts.dump_user_agent:
	print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
		if opts.batchfile == '-':
			batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Drop empty lines and comment lines starting with '#', '/' or ';'.
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

# --list-extractors: show each extractor and which given URLs it handles.
if opts.list_extractors:
	for ie in extractors:
		matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
		all_urls = filter(lambda url: url not in matchedUrls, all_urls)
		for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
		parser.error(u'invalid audio format specified')

# Build the downloader; any of the "get"/simulate flags implies quiet
# operation and skipping the actual download.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'forceformat': opts.getformat,
	'simulate': opts.simulate,
	'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'listformats': opts.listformats,
	# Output template: explicit -o wins; otherwise pick a template from
	# the title/literal/autonumber flags; fall back to '%(id)s.%(ext)s'.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	'matchtitle': opts.matchtitle,
	'rejecttitle': opts.rejecttitle,
	'max_downloads': opts.max_downloads,
	'prefer_free_formats': opts.prefer_free_formats,
	'verbose': opts.verbose,
for extractor in extractors:
	fd.add_info_extractor(extractor)

# Attach post-processors (currently only audio extraction).
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# --update: replace this script with the latest released version.
if opts.update_self:
	updateSelf(fd, sys.argv[0])

if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

	retcode = fd.download(all_urls)
except MaxDownloadsReached:
	fd.to_screen(u'--max-download limit reached, aborting.')

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')

except DownloadError:
except SameFileError:
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')
4631 if __name__ == '__main__':
4634 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: