2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.27'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
def raiseError(msg, i):
    """Signal a parse failure at offset i, quoting the unparsed remainder of s."""
    detail = repr(s) + ': ' + repr(s[i:])
    raise ValueError(msg + ' at position ' + str(i) + ' of ' + detail)
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Make a video title safe for use inside a filename."""
    # Decode HTML entities first, then mask the path separator so the
    # title cannot introduce extra directory components.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(u'%s' % os.sep, u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
348 class MaxDownloadsReached(Exception):
349 """ --max-downloads limit has been reached. """
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
373 def __init__(self, downloaded, expected):
374 self.downloaded = downloaded
375 self.expected = expected
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
498 _download_retcode = None
499 _num_downloads = None
502 def __init__(self, params):
503 """Create a FileDownloader object with the given options."""
506 self._download_retcode = 0
507 self._num_downloads = 0
508 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512 def format_bytes(bytes):
515 if type(bytes) is str:
520 exponent = long(math.log(bytes, 1024.0))
521 suffix = 'bkMGTPEZY'[exponent]
522 converted = float(bytes) / float(1024 ** exponent)
523 return '%.2f%s' % (converted, suffix)
526 def calc_percent(byte_counter, data_len):
529 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532 def calc_eta(start, now, total, current):
536 if current == 0 or dif < 0.001: # One millisecond
538 rate = float(current) / dif
539 eta = long((float(total) - float(current)) / rate)
540 (eta_mins, eta_secs) = divmod(eta, 60)
543 return '%02d:%02d' % (eta_mins, eta_secs)
546 def calc_speed(start, now, bytes):
548 if bytes == 0 or dif < 0.001: # One millisecond
549 return '%10s' % '---b/s'
550 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553 def best_block_size(elapsed_time, bytes):
554 new_min = max(bytes / 2.0, 1.0)
555 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556 if elapsed_time < 0.001:
558 rate = bytes / elapsed_time
566 def parse_bytes(bytestr):
567 """Parse a string indicating a byte quantity into a long integer."""
568 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
571 number = float(matchobj.group(1))
572 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573 return long(round(number * multiplier))
575 def add_info_extractor(self, ie):
576 """Add an InfoExtractor object to the end of the list."""
578 ie.set_downloader(self)
580 def add_post_processor(self, pp):
581 """Add a PostProcessor object to the end of the chain."""
583 pp.set_downloader(self)
585 def to_screen(self, message, skip_eol=False):
586 """Print message to stdout if not in quiet mode."""
587 assert type(message) == type(u'')
588 if not self.params.get('quiet', False):
589 terminator = [u'\n', u''][skip_eol]
590 output = message + terminator
592 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
593 output = output.encode(preferredencoding(), 'ignore')
594 self._screen_file.write(output)
595 self._screen_file.flush()
597 def to_stderr(self, message):
598 """Print message to stderr."""
599 print >>sys.stderr, message.encode(preferredencoding())
601 def to_cons_title(self, message):
602 """Set console/terminal window title to message."""
603 if not self.params.get('consoletitle', False):
605 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
606 # c_wchar_p() might not be necessary if `message` is
607 # already of type unicode()
608 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
609 elif 'TERM' in os.environ:
610 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612 def fixed_template(self):
613 """Checks if the output template is fixed."""
614 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
616 def trouble(self, message=None):
617 """Determine action to take when a download problem appears.
619 Depending on if the downloader has been configured to ignore
620 download errors or not, this method may throw an exception or
621 not when errors are found, after printing the message.
623 if message is not None:
624 self.to_stderr(message)
625 if not self.params.get('ignoreerrors', False):
626 raise DownloadError(message)
627 self._download_retcode = 1
629 def slow_down(self, start_time, byte_counter):
630 """Sleep if the download speed is over the rate limit."""
631 rate_limit = self.params.get('ratelimit', None)
632 if rate_limit is None or byte_counter == 0:
635 elapsed = now - start_time
638 speed = float(byte_counter) / elapsed
639 if speed > rate_limit:
640 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642 def temp_name(self, filename):
643 """Returns a temporary filename for the given filename."""
644 if self.params.get('nopart', False) or filename == u'-' or \
645 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
647 return filename + u'.part'
649 def undo_temp_name(self, filename):
650 if filename.endswith(u'.part'):
651 return filename[:-len(u'.part')]
654 def try_rename(self, old_filename, new_filename):
656 if old_filename == new_filename:
658 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659 except (IOError, OSError), err:
660 self.trouble(u'ERROR: unable to rename file')
662 def try_utime(self, filename, last_modified_hdr):
663 """Try to set the last-modified time of the given file."""
664 if last_modified_hdr is None:
666 if not os.path.isfile(_encodeFilename(filename)):
668 timestr = last_modified_hdr
671 filetime = timeconvert(timestr)
675 os.utime(filename, (time.time(), filetime))
680 def report_writedescription(self, descfn):
681 """ Report that the description file is being written """
682 self.to_screen(u'[info] Writing video description to: ' + descfn)
684 def report_writeinfojson(self, infofn):
685 """ Report that the metadata file has been written """
686 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
688 def report_destination(self, filename):
689 """Report destination filename."""
690 self.to_screen(u'[download] Destination: ' + filename)
692 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693 """Report download progress."""
694 if self.params.get('noprogress', False):
696 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
701 def report_resuming_byte(self, resume_len):
702 """Report attempt to resume at given byte."""
703 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
705 def report_retry(self, count, retries):
706 """Report retry in case of HTTP error 5xx"""
707 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
709 def report_file_already_downloaded(self, file_name):
710 """Report file has already been fully downloaded."""
712 self.to_screen(u'[download] %s has already been downloaded' % file_name)
713 except (UnicodeEncodeError), err:
714 self.to_screen(u'[download] The file has already been downloaded')
716 def report_unable_to_resume(self):
717 """Report it was impossible to resume download."""
718 self.to_screen(u'[download] Unable to resume')
720 def report_finish(self):
721 """Report download finished."""
722 if self.params.get('noprogress', False):
723 self.to_screen(u'[download] Download completed')
727 def increment_downloads(self):
728 """Increment the ordinal that assigns a number to each file."""
729 self._num_downloads += 1
731 def prepare_filename(self, info_dict):
732 """Generate the output filename."""
734 template_dict = dict(info_dict)
735 template_dict['epoch'] = unicode(long(time.time()))
736 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737 filename = self.params['outtmpl'] % template_dict
739 except (ValueError, KeyError), err:
740 self.trouble(u'ERROR: invalid system charset or erroneous output template')
743 def _match_entry(self, info_dict):
744 """ Returns None iff the file should be downloaded """
746 title = info_dict['title']
747 matchtitle = self.params.get('matchtitle', False)
748 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750 rejecttitle = self.params.get('rejecttitle', False)
751 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
755 def process_info(self, info_dict):
756 """Process a single dictionary returned by an InfoExtractor."""
758 reason = self._match_entry(info_dict)
759 if reason is not None:
760 self.to_screen(u'[download] ' + reason)
763 max_downloads = self.params.get('max_downloads')
764 if max_downloads is not None:
765 if self._num_downloads > int(max_downloads):
766 raise MaxDownloadsReached()
768 filename = self.prepare_filename(info_dict)
771 if self.params.get('forcetitle', False):
772 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
773 if self.params.get('forceurl', False):
774 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
775 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
776 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
777 if self.params.get('forcedescription', False) and 'description' in info_dict:
778 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forcefilename', False) and filename is not None:
780 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forceformat', False):
782 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
784 # Do nothing else if in simulate mode
785 if self.params.get('simulate', False):
792 dn = os.path.dirname(_encodeFilename(filename))
793 if dn != '' and not os.path.exists(dn): # dn is already encoded
795 except (OSError, IOError), err:
796 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
799 if self.params.get('writedescription', False):
801 descfn = filename + u'.description'
802 self.report_writedescription(descfn)
803 descfile = open(_encodeFilename(descfn), 'wb')
805 descfile.write(info_dict['description'].encode('utf-8'))
808 except (OSError, IOError):
809 self.trouble(u'ERROR: Cannot write description file ' + descfn)
812 if self.params.get('writeinfojson', False):
813 infofn = filename + u'.info.json'
814 self.report_writeinfojson(infofn)
817 except (NameError,AttributeError):
818 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
821 infof = open(_encodeFilename(infofn), 'wb')
823 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
824 json.dump(json_info_dict, infof)
827 except (OSError, IOError):
828 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
831 if not self.params.get('skip_download', False):
832 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
836 success = self._do_download(filename, info_dict)
837 except (OSError, IOError), err:
838 raise UnavailableVideoError
839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
840 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
842 except (ContentTooShortError, ), err:
843 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
848 self.post_process(filename, info_dict)
849 except (PostProcessingError), err:
850 self.trouble(u'ERROR: postprocessing: %s' % str(err))
853 def download(self, url_list):
854 """Download a given list of URLs."""
855 if len(url_list) > 1 and self.fixed_template():
856 raise SameFileError(self.params['outtmpl'])
859 suitable_found = False
861 # Go to next InfoExtractor if not suitable
862 if not ie.suitable(url):
865 # Suitable InfoExtractor found
866 suitable_found = True
868 # Extract information from URL and process it
871 # Suitable InfoExtractor had been found; go to next URL
874 if not suitable_found:
875 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
877 return self._download_retcode
879 def post_process(self, filename, ie_info):
880 """Run the postprocessing chain on the given file."""
882 info['filepath'] = filename
888 def _download_with_rtmpdump(self, filename, url, player_url):
889 self.report_destination(filename)
890 tmpfilename = self.temp_name(filename)
892 # Check for rtmpdump first
894 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
895 except (OSError, IOError):
896 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
899 # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
901 # possible. This is part of rtmpdump's normal usage, AFAIK.
902 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
903 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
904 if self.params.get('verbose', False):
907 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
910 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
911 retval = subprocess.call(args)
912 while retval == 2 or retval == 1:
913 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
914 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
915 time.sleep(5.0) # This seems to be needed
916 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
917 cursize = os.path.getsize(_encodeFilename(tmpfilename))
918 if prevsize == cursize and retval == 1:
        # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
921 if prevsize == cursize and retval == 2 and cursize > 1024:
922 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
926 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
927 self.try_rename(tmpfilename, filename)
930 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
933 def _do_download(self, filename, info_dict):
934 url = info_dict['url']
935 player_url = info_dict.get('player_url', None)
937 # Check file already present
938 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
939 self.report_file_already_downloaded(filename)
942 # Attempt to download using rtmpdump
943 if url.startswith('rtmp'):
944 return self._download_with_rtmpdump(filename, url, player_url)
946 tmpfilename = self.temp_name(filename)
949 # Do not include the Accept-Encoding header
950 headers = {'Youtubedl-no-compression': 'True'}
951 basic_request = urllib2.Request(url, None, headers)
952 request = urllib2.Request(url, None, headers)
954 # Establish possible resume length
955 if os.path.isfile(_encodeFilename(tmpfilename)):
956 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
962 if self.params.get('continuedl', False):
963 self.report_resuming_byte(resume_len)
964 request.add_header('Range','bytes=%d-' % resume_len)
970 retries = self.params.get('retries', 0)
971 while count <= retries:
972 # Establish connection
974 if count == 0 and 'urlhandle' in info_dict:
975 data = info_dict['urlhandle']
976 data = urllib2.urlopen(request)
978 except (urllib2.HTTPError, ), err:
979 if (err.code < 500 or err.code >= 600) and err.code != 416:
980 # Unexpected HTTP error
982 elif err.code == 416:
983 # Unable to resume (requested range not satisfiable)
985 # Open the connection again without the range header
986 data = urllib2.urlopen(basic_request)
987 content_length = data.info()['Content-Length']
988 except (urllib2.HTTPError, ), err:
989 if err.code < 500 or err.code >= 600:
992 # Examine the reported length
993 if (content_length is not None and
994 (resume_len - 100 < long(content_length) < resume_len + 100)):
995 # The file had already been fully downloaded.
996 # Explanation to the above condition: in issue #175 it was revealed that
997 # YouTube sometimes adds or removes a few bytes from the end of the file,
998 # changing the file size slightly and causing problems for some users. So
999 # I decided to implement a suggested change and consider the file
1000 # completely downloaded if the file size differs less than 100 bytes from
1001 # the one in the hard drive.
1002 self.report_file_already_downloaded(filename)
1003 self.try_rename(tmpfilename, filename)
1006 # The length does not match, we start the download over
1007 self.report_unable_to_resume()
1012 if count <= retries:
1013 self.report_retry(count, retries)
1016 self.trouble(u'ERROR: giving up after %s retries' % retries)
1019 data_len = data.info().get('Content-length', None)
1020 if data_len is not None:
1021 data_len = long(data_len) + resume_len
1022 data_len_str = self.format_bytes(data_len)
1023 byte_counter = 0 + resume_len
1027 # Download and write
1028 before = time.time()
1029 data_block = data.read(block_size)
1031 if len(data_block) == 0:
1033 byte_counter += len(data_block)
1035 # Open file just in time
1038 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1039 assert stream is not None
1040 filename = self.undo_temp_name(tmpfilename)
1041 self.report_destination(filename)
1042 except (OSError, IOError), err:
1043 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1046 stream.write(data_block)
1047 except (IOError, OSError), err:
1048 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1050 block_size = self.best_block_size(after - before, len(data_block))
1053 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1054 if data_len is None:
1055 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1057 percent_str = self.calc_percent(byte_counter, data_len)
1058 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1059 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1062 self.slow_down(start, byte_counter - resume_len)
1065 self.trouble(u'\nERROR: Did not get any data blocks')
1068 self.report_finish()
1069 if data_len is not None and byte_counter != data_len:
1070 raise ContentTooShortError(byte_counter, long(data_len))
1071 self.try_rename(tmpfilename, filename)
1073 # Update file modification time
1074 if self.params.get('updatetime', True):
1075 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
# Abstract base for all site extractors (template-method pattern): the
# public entry points initialize()/extract() delegate to the subclass
# hooks _real_initialize()/_real_extract().
1080 class InfoExtractor(object):
1081 """Information Extractor class.
1083 Information extractors are the classes that, given a URL, extract
1084 information from the video (or videos) the URL refers to. This
1085 information includes the real video URL, the video title and simplified
1086 title, author and others. The information is stored in a dictionary
1087 which is then passed to the FileDownloader. The FileDownloader
1088 processes this information possibly downloading the video to the file
1089 system, among other possible outcomes. The dictionaries must include
1090 the following fields:
1092 id: Video identifier.
1093 url: Final video URL.
1094 uploader: Nickname of the video uploader.
1095 title: Literal title.
1096 stitle: Simplified title.
1097 ext: Video filename extension.
1098 format: Video format.
1099 player_url: SWF Player URL (may be None).
1101 The following fields are optional. Their primary purpose is to allow
1102 youtube-dl to serve as the backend for a video search function, such
1103 as the one in youtube2mp3. They are only used when their respective
1104 forced printing functions are called:
1106 thumbnail: Full URL to a video thumbnail image.
1107 description: One-line video description.
1109 Subclasses of this one should re-define the _real_initialize() and
1110 _real_extract() methods and define a _VALID_URL regexp.
1111 Probably, they should also be added to the list of extractors.
1117 def __init__(self, downloader=None):
1118 """Constructor. Receives an optional downloader."""
1120 self.set_downloader(downloader)
1122 def suitable(self, url):
1123 """Receives a URL and returns True if suitable for this IE."""
# Anchored match against the subclass-defined _VALID_URL pattern.
1124 return re.match(self._VALID_URL, url) is not None
1126 def initialize(self):
1127 """Initializes an instance (authentication, etc)."""
# NOTE(review): lines 1128/1130-1131 are elided here — presumably a
# "_ready" guard around this call; confirm against the full file.
1129 self._real_initialize()
1132 def extract(self, url):
1133 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): line 1134 (elided) presumably calls self.initialize()
# before delegating — confirm against the full file.
1135 return self._real_extract(url)
1137 def set_downloader(self, downloader):
1138 """Sets the downloader for this IE."""
1139 self._downloader = downloader
1141 def _real_initialize(self):
1142 """Real initialization process. Redefine in subclasses."""
1145 def _real_extract(self, url):
1146 """Real extraction process. Redefine in subclasses."""
1150 class YoutubeIE(InfoExtractor):
1151 """Information extractor for youtube.com."""
# Group 1 captures the URL prefix (scheme/host/path up to the id); group 2
# is the video id itself (_real_extract reads mobj.group(2)). The trailing
# (?(1).+)? is a conditional: extra characters are allowed after the id
# only when group 1 matched.
1153 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1154 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1155 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1156 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1157 _NETRC_MACHINE = 'youtube'
1158 # Listed in order of quality
1159 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1160 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map (most entries elided in this excerpt).
1161 _video_extensions = {
1167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> human-readable resolution map used by _print_formats (entries
# elided in this excerpt).
1172 _video_dimensions = {
1187 IE_NAME = u'youtube'
1189 def report_lang(self):
1190 """Report attempt to set language."""
1191 self._downloader.to_screen(u'[youtube] Setting language')
1193 def report_login(self):
1194 """Report attempt to log in."""
1195 self._downloader.to_screen(u'[youtube] Logging in')
1197 def report_age_confirmation(self):
1198 """Report attempt to confirm age."""
1199 self._downloader.to_screen(u'[youtube] Confirming age')
1201 def report_video_webpage_download(self, video_id):
1202 """Report attempt to download video webpage."""
1203 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1205 def report_video_info_webpage_download(self, video_id):
1206 """Report attempt to download video info webpage."""
1207 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1209 def report_information_extraction(self, video_id):
1210 """Report attempt to extract video information."""
1211 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1213 def report_unavailable_format(self, video_id, format):
1214 """Report extracted video URL."""
1215 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1217 def report_rtmp_download(self):
1218 """Indicate the download will use the RTMP protocol."""
1219 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Print one "itag : extension [dimensions]" line per available format;
# unknown itags fall back to 'flv' / '???'. (Loop header at elided line 1223.)
1221 def _print_formats(self, formats):
1222 print 'Available formats:'
1224 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Authentication/cookie setup: credentials from params or ~/.netrc, force
# the English site, log in, then pre-confirm age. All failures here are
# warnings (downloads can still proceed anonymously).
1226 def _real_initialize(self):
1227 if self._downloader is None:
1232 downloader_params = self._downloader.params
1234 # Attempt to use provided username and password or .netrc data
1235 if downloader_params.get('username', None) is not None:
1236 username = downloader_params['username']
1237 password = downloader_params['password']
1238 elif downloader_params.get('usenetrc', False):
1240 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1241 if info is not None:
1245 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1246 except (IOError, netrc.NetrcParseError), err:
1247 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: request _LANG_URL so later pages come back in English,
# which the date/description regexes below depend on.
1251 request = urllib2.Request(self._LANG_URL)
1254 urllib2.urlopen(request).read()
1255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1259 # No authentication to be performed
1260 if username is None:
# POST the signup-page login form; a re-served loginForm means rejection.
1265 'current_form': 'loginForm',
1267 'action_login': 'Log In',
1268 'username': username,
1269 'password': password,
1271 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1274 login_results = urllib2.urlopen(request).read()
1275 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1276 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Pre-confirm age so age-restricted videos do not interrupt extraction.
1285 'action_confirm': 'Confirm',
1287 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1289 self.report_age_confirmation()
1290 age_results = urllib2.urlopen(request).read()
1291 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1292 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1295 def _real_extract(self, url):
1296 # Extract video id from URL
1297 mobj = re.match(self._VALID_URL, url)
1299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1301 video_id = mobj.group(2)
1304 self.report_video_webpage_download(video_id)
1305 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1307 video_webpage = urllib2.urlopen(request).read()
1308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1312 # Attempt to extract SWF player URL
1313 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314 if mobj is not None:
# Unescape the JSON-style backslash escaping in the matched URL.
1315 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' contexts in turn until one yields a 'token' — some
# videos (e.g. Vevo) only expose their info under a specific context.
1320 self.report_video_info_webpage_download(video_id)
1321 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323 % (video_id, el_type))
1324 request = urllib2.Request(video_info_url)
1326 video_info_webpage = urllib2.urlopen(request).read()
1327 video_info = parse_qs(video_info_webpage)
1328 if 'token' in video_info:
1330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1333 if 'token' not in video_info:
1334 if 'reason' in video_info:
1335 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1337 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1340 # Start extracting information
1341 self.report_information_extraction(video_id)
1344 if 'author' not in video_info:
1345 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1347 video_uploader = urllib.unquote_plus(video_info['author'][0])
1350 if 'title' not in video_info:
1351 self._downloader.trouble(u'ERROR: unable to extract video title')
1353 video_title = urllib.unquote_plus(video_info['title'][0])
1354 video_title = video_title.decode('utf-8')
1355 video_title = sanitize_title(video_title)
1358 simple_title = _simplify_title(video_title)
1361 if 'thumbnail_url' not in video_info:
1362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363 video_thumbnail = ''
1364 else: # don't panic if we can't find it
1365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page (English locale assumed, see
# _real_initialize) and normalized to YYYYMMDD via several date formats.
1369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370 if mobj is not None:
1371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373 for expression in format_expressions:
1375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1383 video_description = u'No description available.'
1384 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1385 if mobj is not None:
1386 video_description = mobj.group(1).decode('utf-8')
# Preferred path: full description via lxml when available (meta tag only
# holds a truncated one-liner).
1388 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1389 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1390 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1391 # TODO use another parser
1394 video_token = urllib.unquote_plus(video_info['token'][0])
1396 # Decide which formats to download
1397 req_format = self._downloader.params.get('format', None)
1399 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1400 self.report_rtmp_download()
1401 video_url_list = [(None, video_info['conn'][0])]
1402 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Parse the comma-separated stream map into an itag -> URL dict.
1403 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1404 url_data = [parse_qs(uds) for uds in url_data_strs]
1405 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1406 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1408 format_limit = self._downloader.params.get('format_limit', None)
1409 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1410 if format_limit is not None and format_limit in available_formats:
# Cap quality: keep only formats at or below the requested limit.
1411 format_list = available_formats[available_formats.index(format_limit):]
1413 format_list = available_formats
1414 existing_formats = [x for x in format_list if x in url_map]
1415 if len(existing_formats) == 0:
1416 self._downloader.trouble(u'ERROR: no known formats available for video')
1418 if self._downloader.params.get('listformats', None):
1419 self._print_formats(existing_formats)
1421 if req_format is None or req_format == 'best':
1422 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1423 elif req_format == 'worst':
1424 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1425 elif req_format in ('-1', 'all'):
1426 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1428 # Specific formats. We pick the first in a slash-delimeted sequence.
1429 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1430 req_formats = req_format.split('/')
1431 video_url_list = None
1432 for rf in req_formats:
1434 video_url_list = [(rf, url_map[rf])]
1436 if video_url_list is None:
1437 self._downloader.trouble(u'ERROR: requested format not available')
1440 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1443 for format_param, video_real_url in video_url_list:
1444 # At this point we have a new video
1445 self._downloader.increment_downloads()
1448 video_extension = self._video_extensions.get(format_param, 'flv')
1451 # Process video information
1452 self._downloader.process_info({
1453 'id': video_id.decode('utf-8'),
1454 'url': video_real_url.decode('utf-8'),
1455 'uploader': video_uploader.decode('utf-8'),
1456 'upload_date': upload_date,
1457 'title': video_title,
1458 'stitle': simple_title,
1459 'ext': video_extension.decode('utf-8'),
# format_param is None only on the RTMP path; report u'NA' in that case.
1460 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1461 'thumbnail': video_thumbnail.decode('utf-8'),
1462 'description': video_description,
1463 'player_url': player_url,
1465 except UnavailableVideoError, err:
1466 self._downloader.trouble(u'\nERROR: unable to download video')
1469 class MetacafeIE(InfoExtractor):
1470 """Information Extractor for metacafe.com."""
# Group 1 is the numeric/"yt-" video id, group 2 the URL slug used as the
# simplified title (see _real_extract).
1472 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1473 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1474 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1476 IE_NAME = u'metacafe'
# Holds a YoutubeIE so yt-hosted Metacafe videos can be delegated to it.
1478 def __init__(self, youtube_ie, downloader=None):
1479 InfoExtractor.__init__(self, downloader)
1480 self._youtube_ie = youtube_ie
1482 def report_disclaimer(self):
1483 """Report disclaimer retrieval."""
1484 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1486 def report_age_confirmation(self):
1487 """Report attempt to confirm age."""
1488 self._downloader.to_screen(u'[metacafe] Confirming age')
1490 def report_download_webpage(self, video_id):
1491 """Report webpage download."""
1492 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1494 def report_extraction(self, video_id):
1495 """Report information extraction."""
1496 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the age confirmation
# so filtered videos are reachable during extraction.
1498 def _real_initialize(self):
1499 # Retrieve disclaimer
1500 request = urllib2.Request(self._DISCLAIMER)
1502 self.report_disclaimer()
1503 disclaimer = urllib2.urlopen(request).read()
1504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1511 'submit': "Continue - I'm over 18",
1513 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1515 self.report_age_confirmation()
1516 disclaimer = urllib2.urlopen(request).read()
1517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1521 def _real_extract(self, url):
1522 # Extract id and simplified title from URL
1523 mobj = re.match(self._VALID_URL, url)
1525 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1528 video_id = mobj.group(1)
1530 # Check if video comes from YouTube
1531 mobj2 = re.match(r'^yt-(.*)$', video_id)
1532 if mobj2 is not None:
# Delegate "yt-<id>" videos to the YouTube extractor.
1533 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1536 # At this point we have a new video
1537 self._downloader.increment_downloads()
1539 simple_title = mobj.group(2).decode('utf-8')
1541 # Retrieve video webpage to extract further information
1542 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1544 self.report_download_webpage(video_id)
1545 webpage = urllib2.urlopen(request).read()
1546 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1547 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1550 # Extract URL, uploader and title from webpage
1551 self.report_extraction(video_id)
# Two layouts: a plain &mediaURL= parameter (older pages), else a JSON
# blob inside the "flashvars" form value.
1552 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1553 if mobj is not None:
1554 mediaURL = urllib.unquote(mobj.group(1))
1555 video_extension = mediaURL[-3:]
1557 # Extract gdaKey if available
1558 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1560 video_url = mediaURL
1562 gdaKey = mobj.group(1)
1563 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1565 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1567 self._downloader.trouble(u'ERROR: unable to extract media URL')
1569 vardict = parse_qs(mobj.group(1))
1570 if 'mediaData' not in vardict:
1571 self._downloader.trouble(u'ERROR: unable to extract media URL')
1573 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1575 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo JSON escaping of slashes, then append the access key.
1577 mediaURL = mobj.group(1).replace('\\/', '/')
1578 video_extension = mediaURL[-3:]
1579 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1581 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1583 self._downloader.trouble(u'ERROR: unable to extract title')
1585 video_title = mobj.group(1).decode('utf-8')
1586 video_title = sanitize_title(video_title)
1588 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1590 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1592 video_uploader = mobj.group(1)
1595 # Process video information
1596 self._downloader.process_info({
1597 'id': video_id.decode('utf-8'),
1598 'url': video_url.decode('utf-8'),
1599 'uploader': video_uploader.decode('utf-8'),
1600 'upload_date': u'NA',
1601 'title': video_title,
1602 'stitle': simple_title,
1603 'ext': video_extension.decode('utf-8'),
1607 except UnavailableVideoError:
1608 self._downloader.trouble(u'\nERROR: unable to download video')
1611 class DailymotionIE(InfoExtractor):
1612 """Information Extractor for Dailymotion"""
# Group 1 is the video id (text before the first '_'), group 2 the slug.
1614 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1615 IE_NAME = u'dailymotion'
1617 def __init__(self, downloader=None):
1618 InfoExtractor.__init__(self, downloader)
1620 def report_download_webpage(self, video_id):
1621 """Report webpage download."""
1622 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1624 def report_extraction(self, video_id):
1625 """Report information extraction."""
1626 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1628 def _real_extract(self, url):
1629 # Extract id and simplified title from URL
1630 mobj = re.match(self._VALID_URL, url)
1632 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1635 # At this point we have a new video
1636 self._downloader.increment_downloads()
1637 video_id = mobj.group(1)
1639 video_extension = 'flv'
1641 # Retrieve video webpage to extract further information
1642 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
1643 request.add_header('Cookie', 'family_filter=off')
1645 self.report_download_webpage(video_id)
1646 webpage = urllib2.urlopen(request).read()
1647 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1648 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1651 # Extract URL, uploader and title from webpage
1652 self.report_extraction(video_id)
# The player's "sequence" flashvar embeds the SD stream URL ("sdURL").
1653 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1655 self._downloader.trouble(u'ERROR: unable to extract media URL')
1657 sequence = urllib.unquote(mobj.group(1))
1658 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1660 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip the JSON backslash escaping from the URL.
1662 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1664 # if needed add http://www.dailymotion.com/ if relative URL
1666 video_url = mediaURL
1668 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract title')
1672 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1673 video_title = sanitize_title(video_title)
1674 simple_title = _simplify_title(video_title)
1676 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1678 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1680 video_uploader = mobj.group(1)
1683 # Process video information
1684 self._downloader.process_info({
1685 'id': video_id.decode('utf-8'),
1686 'url': video_url.decode('utf-8'),
1687 'uploader': video_uploader.decode('utf-8'),
1688 'upload_date': u'NA',
1689 'title': video_title,
1690 'stitle': simple_title,
1691 'ext': video_extension.decode('utf-8'),
1695 except UnavailableVideoError:
1696 self._downloader.trouble(u'\nERROR: unable to download video')
1699 class GoogleIE(InfoExtractor):
1700 """Information extractor for video.google.com."""
# Group 1 is the (possibly negative) numeric docid.
1702 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1703 IE_NAME = u'video.google'
1705 def __init__(self, downloader=None):
1706 InfoExtractor.__init__(self, downloader)
1708 def report_download_webpage(self, video_id):
1709 """Report webpage download."""
1710 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1712 def report_extraction(self, video_id):
1713 """Report information extraction."""
1714 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1716 def _real_extract(self, url):
1717 # Extract id from URL
1718 mobj = re.match(self._VALID_URL, url)
1720 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1723 # At this point we have a new video
1724 self._downloader.increment_downloads()
1725 video_id = mobj.group(1)
1727 video_extension = 'mp4'
1729 # Retrieve video webpage to extract further information
1730 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1732 self.report_download_webpage(video_id)
1733 webpage = urllib2.urlopen(request).read()
1734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1738 # Extract URL, uploader, and title from webpage
1739 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv player URL when absent.
1740 mobj = re.search(r"download_url:'([^']+)'", webpage)
1742 video_extension = 'flv'
1743 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract media URL')
1747 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JavaScript \xNN escapes ('=' and '&') left in the URL.
1748 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1749 mediaURL = mediaURL.replace('\\x26', '\x26')
1751 video_url = mediaURL
1753 mobj = re.search(r'<title>(.*)</title>', webpage)
1755 self._downloader.trouble(u'ERROR: unable to extract title')
1757 video_title = mobj.group(1).decode('utf-8')
1758 video_title = sanitize_title(video_title)
1759 simple_title = _simplify_title(video_title)
1761 # Extract video description
1762 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1764 self._downloader.trouble(u'ERROR: unable to extract video description')
1766 video_description = mobj.group(1).decode('utf-8')
1767 if not video_description:
1768 video_description = 'No description available.'
1770 # Extract video thumbnail
1771 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail only appears on the search page, so run a docid search;
# abs() because the search index uses the positive form of the docid.
1772 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1774 webpage = urllib2.urlopen(request).read()
1775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1778 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1780 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1782 video_thumbnail = mobj.group(1)
1783 else: # we need something to pass to process_info
1784 video_thumbnail = ''
1787 # Process video information
1788 self._downloader.process_info({
1789 'id': video_id.decode('utf-8'),
1790 'url': video_url.decode('utf-8'),
# NOTE(review): line 1791 is elided here — presumably the 'uploader'
# entry; confirm against the full file.
1792 'upload_date': u'NA',
1793 'title': video_title,
1794 'stitle': simple_title,
1795 'ext': video_extension.decode('utf-8'),
1799 except UnavailableVideoError:
1800 self._downloader.trouble(u'\nERROR: unable to download video')
1803 class PhotobucketIE(InfoExtractor):
1804 """Information extractor for photobucket.com."""
1806 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807 IE_NAME = u'photobucket'
1809 def __init__(self, downloader=None):
1810 InfoExtractor.__init__(self, downloader)
1812 def report_download_webpage(self, video_id):
1813 """Report webpage download."""
1814 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1816 def report_extraction(self, video_id):
1817 """Report information extraction."""
1818 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1820 def _real_extract(self, url):
1821 # Extract id from URL
1822 mobj = re.match(self._VALID_URL, url)
1824 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1827 # At this point we have a new video
1828 self._downloader.increment_downloads()
1829 video_id = mobj.group(1)
1831 video_extension = 'flv'
1833 # Retrieve video webpage to extract further information
1834 request = urllib2.Request(url)
1836 self.report_download_webpage(video_id)
1837 webpage = urllib2.urlopen(request).read()
1838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842 # Extract URL, uploader, and title from webpage
1843 self.report_extraction(video_id)
1844 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1846 self._downloader.trouble(u'ERROR: unable to extract media URL')
1848 mediaURL = urllib.unquote(mobj.group(1))
1850 video_url = mediaURL
1852 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1854 self._downloader.trouble(u'ERROR: unable to extract title')
1856 video_title = mobj.group(1).decode('utf-8')
1857 video_title = sanitize_title(video_title)
1858 simple_title = _simplify_title(vide_title)
1860 video_uploader = mobj.group(2).decode('utf-8')
1863 # Process video information
1864 self._downloader.process_info({
1865 'id': video_id.decode('utf-8'),
1866 'url': video_url.decode('utf-8'),
1867 'uploader': video_uploader,
1868 'upload_date': u'NA',
1869 'title': video_title,
1870 'stitle': simple_title,
1871 'ext': video_extension.decode('utf-8'),
1875 except UnavailableVideoError:
1876 self._downloader.trouble(u'\nERROR: unable to download video')
1879 class YahooIE(InfoExtractor):
1880 """Information extractor for video.yahoo.com."""
1882 # _VALID_URL matches all Yahoo! Video URLs
1883 # _VPAGE_URL matches only the extractable '/watch/' URLs
1884 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886 IE_NAME = u'video.yahoo'
1888 def __init__(self, downloader=None):
1889 InfoExtractor.__init__(self, downloader)
1891 def report_download_webpage(self, video_id):
1892 """Report webpage download."""
1893 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1895 def report_extraction(self, video_id):
1896 """Report information extraction."""
1897 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1899 def _real_extract(self, url, new_video=True):
1900 # Extract ID from URL
1901 mobj = re.match(self._VALID_URL, url)
1903 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1906 # At this point we have a new video
1907 self._downloader.increment_downloads()
1908 video_id = mobj.group(2)
1909 video_extension = 'flv'
1911 # Rewrite valid but non-extractable URLs as
1912 # extractable English language /watch/ URLs
1913 if re.match(self._VPAGE_URL, url) is None:
1914 request = urllib2.Request(url)
1916 webpage = urllib2.urlopen(request).read()
1917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1923 self._downloader.trouble(u'ERROR: Unable to extract id field')
1925 yahoo_id = mobj.group(1)
1927 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1929 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1931 yahoo_vid = mobj.group(1)
1933 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934 return self._real_extract(url, new_video=False)
1936 # Retrieve video webpage to extract further information
1937 request = urllib2.Request(url)
1939 self.report_download_webpage(video_id)
1940 webpage = urllib2.urlopen(request).read()
1941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1945 # Extract uploader and title from webpage
1946 self.report_extraction(video_id)
1947 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1949 self._downloader.trouble(u'ERROR: unable to extract video title')
1951 video_title = mobj.group(1).decode('utf-8')
1952 simple_title = _simplify_title(video_title)
1954 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1956 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1958 video_uploader = mobj.group(1).decode('utf-8')
1960 # Extract video thumbnail
1961 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1963 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1965 video_thumbnail = mobj.group(1).decode('utf-8')
1967 # Extract video description
1968 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract video description')
1972 video_description = mobj.group(1).decode('utf-8')
1973 if not video_description:
1974 video_description = 'No description available.'
1976 # Extract video height and width
1977 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1979 self._downloader.trouble(u'ERROR: unable to extract video height')
1981 yv_video_height = mobj.group(1)
1983 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1985 self._downloader.trouble(u'ERROR: unable to extract video width')
1987 yv_video_width = mobj.group(1)
1989 # Retrieve video playlist to extract media URL
1990 # I'm not completely sure what all these options are, but we
1991 # seem to need most of them, otherwise the server sends a 401.
1992 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1993 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1994 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1998 self.report_download_webpage(video_id)
1999 webpage = urllib2.urlopen(request).read()
2000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2004 # Extract media URL from playlist XML
2005 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2007 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2009 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2013 # Process video information
2014 self._downloader.process_info({
2015 'id': video_id.decode('utf-8'),
2017 'uploader': video_uploader,
2018 'upload_date': u'NA',
2019 'title': video_title,
2020 'stitle': simple_title,
2021 'ext': video_extension.decode('utf-8'),
2022 'thumbnail': video_thumbnail.decode('utf-8'),
2023 'description': video_description,
2024 'thumbnail': video_thumbnail,
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'\nERROR: unable to download video')
2031 class VimeoIE(InfoExtractor):
2032 """Information extractor for vimeo.com."""
2034 # _VALID_URL matches Vimeo URLs
# Group 1 is the numeric clip id; player/ and groups/ URL forms included.
2035 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
# NOTE(review): lines 2036-2037 are elided here — presumably IE_NAME, as
# in the sibling extractors; confirm against the full file.
2038 def __init__(self, downloader=None):
2039 InfoExtractor.__init__(self, downloader)
2041 def report_download_webpage(self, video_id):
2042 """Report webpage download."""
2043 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2045 def report_extraction(self, video_id):
2046 """Report information extraction."""
2047 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Extraction via the moogaloop XML config endpoint (continues beyond this
# excerpt).
2049 def _real_extract(self, url, new_video=True):
2050 # Extract ID from URL
2051 mobj = re.match(self._VALID_URL, url)
2053 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2056 # At this point we have a new video
2057 self._downloader.increment_downloads()
2058 video_id = mobj.group(1)
2060 # Retrieve video webpage to extract further information
# std_headers supplies a browser User-Agent; the clip metadata comes back
# as XML from the moogaloop player config endpoint.
2061 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2063 self.report_download_webpage(video_id)
2064 webpage = urllib2.urlopen(request).read()
2065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2069 # Now we begin extracting as much information as we can from what we
2070 # retrieved. First we extract the information common to all extractors,
2071 # and latter we extract those that are Vimeo specific.
2072 self.report_extraction(video_id)
2075 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2077 self._downloader.trouble(u'ERROR: unable to extract video title')
2079 video_title = mobj.group(1).decode('utf-8')
2080 simple_title = _simplify_title(video_title)
2083 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2085 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2087 video_uploader = mobj.group(1).decode('utf-8')
2089 # Extract video thumbnail
2090 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2092 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2094 video_thumbnail = mobj.group(1).decode('utf-8')
2096 # # Extract video description
2097 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2099 # self._downloader.trouble(u'ERROR: unable to extract video description')
2101 # video_description = mobj.group(1).decode('utf-8')
2102 # if not video_description: video_description = 'No description available.'
# Placeholder while the description extraction above is commented out.
2103 video_description = 'Foo.'
2105 # Vimeo specific: extract request signature
2106 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2108 self._downloader.trouble(u'ERROR: unable to extract request signature')
2110 sig = mobj.group(1).decode('utf-8')
2112 # Vimeo specific: extract video quality information
2113 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2115 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2117 quality = mobj.group(1).decode('utf-8')
2119 if int(quality) == 1:
2124 # Vimeo specific: Extract request signature expiration
2125 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2127 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2129 sig_exp = mobj.group(1).decode('utf-8')
2131 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2134 # Process video information
2135 self._downloader.process_info({
2136 'id': video_id.decode('utf-8'),
2138 'uploader': video_uploader,
2139 'upload_date': u'NA',
2140 'title': video_title,
2141 'stitle': simple_title,
2143 'thumbnail': video_thumbnail.decode('utf-8'),
2144 'description': video_description,
2145 'thumbnail': video_thumbnail,
2146 'description': video_description,
2149 except UnavailableVideoError:
2150 self._downloader.trouble(u'ERROR: unable to download video')
2153 class GenericIE(InfoExtractor):
2154 """Generic last-resort information extractor."""
2157 IE_NAME = u'generic'
2159 def __init__(self, downloader=None):
2160 InfoExtractor.__init__(self, downloader)
2162 def report_download_webpage(self, video_id):
2163 """Report webpage download."""
2164 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2165 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2167 def report_extraction(self, video_id):
2168 """Report information extraction."""
2169 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2171 def _real_extract(self, url):
2172 # At this point we have a new video
2173 self._downloader.increment_downloads()
2175 video_id = url.split('/')[-1]
2176 request = urllib2.Request(url)
2178 self.report_download_webpage(video_id)
2179 webpage = urllib2.urlopen(request).read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2183 except ValueError, err:
2184 # since this is the last-resort InfoExtractor, if
2185 # this error is thrown, it'll be thrown here
2186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2189 self.report_extraction(video_id)
2190 # Start with something easy: JW Player in SWFObject
2191 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2193 # Broaden the search a little bit
2194 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2196 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2199 # It's possible that one of the regexes
2200 # matched, but returned an empty group:
2201 if mobj.group(1) is None:
2202 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2205 video_url = urllib.unquote(mobj.group(1))
2206 video_id = os.path.basename(video_url)
2208 # here's a fun little line of code for you:
2209 video_extension = os.path.splitext(video_id)[1][1:]
2210 video_id = os.path.splitext(video_id)[0]
2212 # it's tempting to parse this further, but you would
2213 # have to take into account all the variations like
2214 # Video Title - Site Name
2215 # Site Name | Video Title
2216 # Video Title - Tagline | Site Name
2217 # and so on and so forth; it's just not practical
2218 mobj = re.search(r'<title>(.*)</title>', webpage)
2220 self._downloader.trouble(u'ERROR: unable to extract title')
2222 video_title = mobj.group(1).decode('utf-8')
2223 video_title = sanitize_title(video_title)
2224 simple_title = _simplify_title(video_title)
2226 # video uploader is domain name
2227 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2229 self._downloader.trouble(u'ERROR: unable to extract title')
2231 video_uploader = mobj.group(1).decode('utf-8')
2234 # Process video information
2235 self._downloader.process_info({
2236 'id': video_id.decode('utf-8'),
2237 'url': video_url.decode('utf-8'),
2238 'uploader': video_uploader,
2239 'upload_date': u'NA',
2240 'title': video_title,
2241 'stitle': simple_title,
2242 'ext': video_extension.decode('utf-8'),
2246 except UnavailableVideoError, err:
2247 self._downloader.trouble(u'\nERROR: unable to download video')
2250 class YoutubeSearchIE(InfoExtractor):
2251 """Information Extractor for YouTube search queries."""
# NOTE(review): this class is near-identical in shape to GoogleSearchIE and
# YahooSearchIE below; keep the three in sync when changing any of them.
2252 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2253 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2254 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2255 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2257 _max_youtube_results = 1000
2258 IE_NAME = u'youtube:search'
2260 def __init__(self, youtube_ie, downloader=None):
2261 InfoExtractor.__init__(self, downloader)
2262 self._youtube_ie = youtube_ie
2264 def report_download_page(self, query, pagenum):
2265 """Report attempt to download playlist page with given number."""
2266 query = query.decode(preferredencoding())
2267 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2269 def _real_initialize(self):
2270 self._youtube_ie.initialize()
2272 def _real_extract(self, query):
# Parse the "ytsearch<N|all>:terms" prefix and dispatch to
# _download_n_results with the requested result count.
2273 mobj = re.match(self._VALID_URL, query)
2275 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2278 prefix, query = query.split(':')
2280 query = query.encode('utf-8')
2282 self._download_n_results(query, 1)
2284 elif prefix == 'all':
2285 self._download_n_results(query, self._max_youtube_results)
2291 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2293 elif n > self._max_youtube_results:
2294 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2295 n = self._max_youtube_results
2296 self._download_n_results(query, n)
2298 except ValueError: # parsing prefix as integer fails
2299 self._download_n_results(query, 1)
2302 def _download_n_results(self, query, n):
2303 """Downloads a specified number of results for a query"""
2306 already_seen = set()
2310 self.report_download_page(query, pagenum)
2311 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2312 request = urllib2.Request(result_url)
2314 page = urllib2.urlopen(request).read()
2315 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2319 # Extract video identifiers
2320 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): fragile scrape -- slices the matched href text and takes
# the token after the second '=' minus the closing quote; breaks if the
# markup gains extra query parameters. Consider a capturing group instead.
2321 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2322 if video_id not in already_seen:
2323 video_ids.append(video_id)
2324 already_seen.add(video_id)
2325 if len(video_ids) == n:
2326 # Specified n videos reached
2327 for id in video_ids:
2328 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means this was the last results page: flush and stop.
2331 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332 for id in video_ids:
2333 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2336 pagenum = pagenum + 1
2339 class GoogleSearchIE(InfoExtractor):
2340 """Information Extractor for Google Video search queries."""
# NOTE(review): structural twin of YoutubeSearchIE / YahooSearchIE; only the
# URL templates, indicator regexes and delegate IE differ.
2341 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2342 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2343 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2344 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2346 _max_google_results = 1000
2347 IE_NAME = u'video.google:search'
2349 def __init__(self, google_ie, downloader=None):
2350 InfoExtractor.__init__(self, downloader)
2351 self._google_ie = google_ie
2353 def report_download_page(self, query, pagenum):
2354 """Report attempt to download playlist page with given number."""
2355 query = query.decode(preferredencoding())
2356 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2358 def _real_initialize(self):
2359 self._google_ie.initialize()
2361 def _real_extract(self, query):
# Parse "gvsearch<N|all>:terms" and dispatch with the requested count.
2362 mobj = re.match(self._VALID_URL, query)
2364 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2367 prefix, query = query.split(':')
2369 query = query.encode('utf-8')
2371 self._download_n_results(query, 1)
2373 elif prefix == 'all':
2374 self._download_n_results(query, self._max_google_results)
2380 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2382 elif n > self._max_google_results:
2383 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2384 n = self._max_google_results
2385 self._download_n_results(query, n)
2387 except ValueError: # parsing prefix as integer fails
2388 self._download_n_results(query, 1)
2391 def _download_n_results(self, query, n):
2392 """Downloads a specified number of results for a query"""
2395 already_seen = set()
2399 self.report_download_page(query, pagenum)
# NOTE(review): the template's second slot is named "start" but is filled
# with a page number here (not a result offset) -- confirm against the
# endpoint before changing pagination.
2400 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2401 request = urllib2.Request(result_url)
2403 page = urllib2.urlopen(request).read()
2404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2408 # Extract video identifiers
2409 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2410 video_id = mobj.group(1)
2411 if video_id not in already_seen:
2412 video_ids.append(video_id)
2413 already_seen.add(video_id)
2414 if len(video_ids) == n:
2415 # Specified n videos reached
2416 for id in video_ids:
2417 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last results page -- flush collected ids and stop.
2420 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2421 for id in video_ids:
2422 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2425 pagenum = pagenum + 1
2428 class YahooSearchIE(InfoExtractor):
2429 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): structural twin of YoutubeSearchIE / GoogleSearchIE.
2430 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2431 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2432 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2433 _MORE_PAGES_INDICATOR = r'\s*Next'
2435 _max_yahoo_results = 1000
2436 IE_NAME = u'video.yahoo:search'
2438 def __init__(self, yahoo_ie, downloader=None):
2439 InfoExtractor.__init__(self, downloader)
2440 self._yahoo_ie = yahoo_ie
2442 def report_download_page(self, query, pagenum):
2443 """Report attempt to download playlist page with given number."""
2444 query = query.decode(preferredencoding())
2445 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2447 def _real_initialize(self):
2448 self._yahoo_ie.initialize()
2450 def _real_extract(self, query):
# Parse "yvsearch<N|all>:terms" and dispatch with the requested count.
2451 mobj = re.match(self._VALID_URL, query)
2453 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2456 prefix, query = query.split(':')
2458 query = query.encode('utf-8')
2460 self._download_n_results(query, 1)
2462 elif prefix == 'all':
2463 self._download_n_results(query, self._max_yahoo_results)
2469 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2471 elif n > self._max_yahoo_results:
2472 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2473 n = self._max_yahoo_results
2474 self._download_n_results(query, n)
2476 except ValueError: # parsing prefix as integer fails
2477 self._download_n_results(query, 1)
2480 def _download_n_results(self, query, n):
2481 """Downloads a specified number of results for a query"""
2484 already_seen = set()
2488 self.report_download_page(query, pagenum)
2489 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2490 request = urllib2.Request(result_url)
2492 page = urllib2.urlopen(request).read()
2493 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2494 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2497 # Extract video identifiers
2498 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# video_id here is a two-part "nnn/mmm" path segment, per _VIDEO_INDICATOR.
2499 video_id = mobj.group(1)
2500 if video_id not in already_seen:
2501 video_ids.append(video_id)
2502 already_seen.add(video_id)
2503 if len(video_ids) == n:
2504 # Specified n videos reached
2505 for id in video_ids:
2506 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last results page -- flush collected ids and stop.
2509 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2510 for id in video_ids:
2511 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2514 pagenum = pagenum + 1
2517 class YoutubePlaylistIE(InfoExtractor):
2518 """Information Extractor for YouTube playlists."""
# Groups: (1) playlist kind marker ('p'/'a'/'list'), (2) playlist id,
# (3) optional single-video id embedded in the playlist URL.
2520 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2521 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
# NOTE(review): assumes the page markup carries list=PL<id>; ids captured
# without the PL prefix rely on this template matching -- verify for
# 'a'/'list' style playlists.
2522 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2523 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2525 IE_NAME = u'youtube:playlist'
2527 def __init__(self, youtube_ie, downloader=None):
2528 InfoExtractor.__init__(self, downloader)
2529 self._youtube_ie = youtube_ie
2531 def report_download_page(self, playlist_id, pagenum):
2532 """Report attempt to download playlist page with given number."""
2533 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2535 def _real_initialize(self):
2536 self._youtube_ie.initialize()
2538 def _real_extract(self, url):
2539 # Extract playlist id
2540 mobj = re.match(self._VALID_URL, url)
2542 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A URL that names a single video inside the playlist is delegated
# directly to the plain YouTube extractor.
2546 if mobj.group(3) is not None:
2547 self._youtube_ie.extract(mobj.group(3))
2550 # Download playlist pages
2551 # prefix is 'p' as default for playlists but there are other types that need extra care
2552 playlist_prefix = mobj.group(1)
2553 if playlist_prefix == 'a':
2554 playlist_access = 'artist'
2556 playlist_prefix = 'p'
2557 playlist_access = 'view_play_list'
2558 playlist_id = mobj.group(2)
2563 self.report_download_page(playlist_id, pagenum)
2564 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2565 request = urllib2.Request(url)
2567 page = urllib2.urlopen(request).read()
2568 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2572 # Extract video identifiers
2574 video_indicator = self._VIDEO_INDICATOR_TEMPLATE % playlist_id
2575 for mobj in re.finditer(video_indicator, page):
2576 if mobj.group(1) not in ids_in_page:
2577 ids_in_page.append(mobj.group(1))
2578 video_ids.extend(ids_in_page)
# No "Next" link means the last playlist page was reached.
2580 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2582 pagenum = pagenum + 1
# Apply the user's --playlist-start / --playlist-end window
# (playliststart is 1-based in params, hence the -1; -1 end means "all").
2584 playliststart = self._downloader.params.get('playliststart', 1) - 1
2585 playlistend = self._downloader.params.get('playlistend', -1)
2587 if playlistend == -1:
2588 video_ids = video_ids[playliststart:]
2590 video_ids = video_ids[playliststart:playlistend]
2592 for id in video_ids:
2593 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2597 class YoutubeUserIE(InfoExtractor):
2598 """Information Extractor for YouTube users."""
2600 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2601 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request, so uploads are fetched page by page.
2602 _GDATA_PAGE_SIZE = 50
2603 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2604 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2606 IE_NAME = u'youtube:user'
2608 def __init__(self, youtube_ie, downloader=None):
2609 InfoExtractor.__init__(self, downloader)
2610 self._youtube_ie = youtube_ie
2612 def report_download_page(self, username, start_index):
2613 """Report attempt to download user page."""
2614 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2615 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2617 def _real_initialize(self):
2618 self._youtube_ie.initialize()
2620 def _real_extract(self, url):
2622 mobj = re.match(self._VALID_URL, url)
2624 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2627 username = mobj.group(1)
2629 # Download video ids using YouTube Data API. Result size per
2630 # query is limited (currently to 50 videos) so we need to query
2631 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2638 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2639 self.report_download_page(username, start_index)
2641 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2644 page = urllib2.urlopen(request).read()
2645 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2646 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2649 # Extract video identifiers
2652 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2653 if mobj.group(1) not in ids_in_page:
2654 ids_in_page.append(mobj.group(1))
2656 video_ids.extend(ids_in_page)
2658 # A little optimization - if current page is not
2659 # "full", ie. does not contain PAGE_SIZE video ids then
2660 # we can assume that this page is the last one - there
2661 # are no more ids on further pages - no need to query
2664 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the --playlist-start / --playlist-end window (1-based start in
# params, hence the -1; playlistend == -1 means "until the end").
2669 all_ids_count = len(video_ids)
2670 playliststart = self._downloader.params.get('playliststart', 1) - 1
2671 playlistend = self._downloader.params.get('playlistend', -1)
2673 if playlistend == -1:
2674 video_ids = video_ids[playliststart:]
2676 video_ids = video_ids[playliststart:playlistend]
2678 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2679 (username, all_ids_count, len(video_ids)))
2681 for video_id in video_ids:
2682 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2685 class DepositFilesIE(InfoExtractor):
2686 """Information extractor for depositfiles.com"""
2688 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2689 IE_NAME = u'DepositFiles'
2691 def __init__(self, downloader=None):
2692 InfoExtractor.__init__(self, downloader)
2694 def report_download_webpage(self, file_id):
2695 """Report webpage download."""
2696 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2698 def report_extraction(self, file_id):
2699 """Report information extraction."""
2700 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2702 def _real_extract(self, url):
2703 # At this point we have a new file
2704 self._downloader.increment_downloads()
2706 file_id = url.split('/')[-1]
2707 # Rebuild url in english locale
2708 url = 'http://depositfiles.com/en/files/' + file_id
2710 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates the "Free download" button press.
2711 free_download_indication = { 'gateway_result' : '1' }
2712 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2714 self.report_download_webpage(file_id)
2715 webpage = urllib2.urlopen(request).read()
2716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2717 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2720 # Search for the real file URL
2721 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2722 if (mobj is None) or (mobj.group(1) is None):
2723 # Try to figure out reason of the error.
2724 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2725 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): non-raw '\s+' pattern; works, but the rest of the file
# uses raw strings for regexes -- normalize to r'\s+' when touching this.
2726 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2727 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2729 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2732 file_url = mobj.group(1)
2733 file_extension = os.path.splitext(file_url)[1][1:]
2735 # Search for file title
2736 mobj = re.search(r'<b title="(.*?)">', webpage)
2738 self._downloader.trouble(u'ERROR: unable to extract title')
2740 file_title = mobj.group(1).decode('utf-8')
2743 # Process file information
2744 self._downloader.process_info({
2745 'id': file_id.decode('utf-8'),
2746 'url': file_url.decode('utf-8'),
2748 'upload_date': u'NA',
# 'stitle' reuses the raw title here (no _simplify_title pass, unlike
# the video extractors).
2749 'title': file_title,
2750 'stitle': file_title,
2751 'ext': file_extension.decode('utf-8'),
2755 except UnavailableVideoError, err:
2756 self._downloader.trouble(u'ERROR: unable to download file')
2759 class FacebookIE(InfoExtractor):
2760 """Information Extractor for Facebook"""
# NOTE(review): this excerpt elides some original lines (guards, returns,
# and some assignments); comments describe the visible logic only.
2762 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2763 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2764 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used for format_limit / best-quality selection below.
2765 _available_formats = ['video', 'highqual', 'lowqual']
2766 _video_extensions = {
2771 IE_NAME = u'facebook'
2773 def __init__(self, downloader=None):
2774 InfoExtractor.__init__(self, downloader)
2776 def _reporter(self, message):
2777 """Add header and report message."""
2778 self._downloader.to_screen(u'[facebook] %s' % message)
2780 def report_login(self):
2781 """Report attempt to log in."""
2782 self._reporter(u'Logging in')
2784 def report_video_webpage_download(self, video_id):
2785 """Report attempt to download video webpage."""
2786 self._reporter(u'%s: Downloading video webpage' % video_id)
2788 def report_information_extraction(self, video_id):
2789 """Report attempt to extract video information."""
2790 self._reporter(u'%s: Extracting video information' % video_id)
2792 def _parse_page(self, video_webpage):
2793 """Extract video information from page"""
# Each entry maps a metadata field to the regex that scrapes it from the
# page's inline JavaScript.
2795 data = {'title': r'\("video_title", "(.*?)"\)',
2796 'description': r'<div class="datawrap">(.*?)</div>',
2797 'owner': r'\("video_owner_name", "(.*?)"\)',
2798 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2801 for piece in data.keys():
2802 mobj = re.search(data[piece], video_webpage)
2803 if mobj is not None:
2804 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2808 for fmt in self._available_formats:
2809 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2810 if mobj is not None:
2811 # URL is in a Javascript segment inside an escaped Unicode format within
2812 # the generally utf-8 page
2813 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2814 video_info['video_urls'] = video_urls
2818 def _real_initialize(self):
# Log in (if credentials are available) so restricted videos resolve.
2819 if self._downloader is None:
2824 downloader_params = self._downloader.params
2826 # Attempt to use provided username and password or .netrc data
2827 if downloader_params.get('username', None) is not None:
2828 useremail = downloader_params['username']
2829 password = downloader_params['password']
2830 elif downloader_params.get('usenetrc', False):
2832 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2833 if info is not None:
2837 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2838 except (IOError, netrc.NetrcParseError), err:
2839 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Anonymous access is still attempted when no credentials are found.
2842 if useremail is None:
2851 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2854 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2855 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2856 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2859 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2862 def _real_extract(self, url):
2863 mobj = re.match(self._VALID_URL, url)
2865 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867 video_id = mobj.group('ID')
2870 self.report_video_webpage_download(video_id)
2871 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2873 page = urllib2.urlopen(request)
2874 video_webpage = page.read()
2875 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2876 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2879 # Start extracting information
2880 self.report_information_extraction(video_id)
2882 # Extract information
2883 video_info = self._parse_page(video_webpage)
2886 if 'owner' not in video_info:
2887 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2889 video_uploader = video_info['owner']
2892 if 'title' not in video_info:
2893 self._downloader.trouble(u'ERROR: unable to extract video title')
2895 video_title = video_info['title']
2896 video_title = video_title.decode('utf-8')
2897 video_title = sanitize_title(video_title)
2899 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning; extraction continues.
2902 if 'thumbnail' not in video_info:
2903 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2904 video_thumbnail = ''
2906 video_thumbnail = video_info['thumbnail']
2910 if 'upload_date' in video_info:
2911 upload_time = video_info['upload_date']
2912 timetuple = email.utils.parsedate_tz(upload_time)
2913 if timetuple is not None:
2915 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2920 video_description = video_info.get('description', 'No description available.')
2922 url_map = video_info['video_urls']
2923 if len(url_map.keys()) > 0:
2924 # Decide which formats to download
# Same best/worst/-1/specific format-selection scheme as the YouTube
# extractor; keep the two in sync when changing selection semantics.
2925 req_format = self._downloader.params.get('format', None)
2926 format_limit = self._downloader.params.get('format_limit', None)
2928 if format_limit is not None and format_limit in self._available_formats:
2929 format_list = self._available_formats[self._available_formats.index(format_limit):]
2931 format_list = self._available_formats
2932 existing_formats = [x for x in format_list if x in url_map]
2933 if len(existing_formats) == 0:
2934 self._downloader.trouble(u'ERROR: no known formats available for video')
2936 if req_format is None:
2937 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2938 elif req_format == 'worst':
2939 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2940 elif req_format == '-1':
2941 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2944 if req_format not in url_map:
2945 self._downloader.trouble(u'ERROR: requested format not available')
2947 video_url_list = [(req_format, url_map[req_format])] # Specific format
2949 for format_param, video_real_url in video_url_list:
2951 # At this point we have a new video
2952 self._downloader.increment_downloads()
2955 video_extension = self._video_extensions.get(format_param, 'mp4')
2958 # Process video information
2959 self._downloader.process_info({
2960 'id': video_id.decode('utf-8'),
2961 'url': video_real_url.decode('utf-8'),
2962 'uploader': video_uploader.decode('utf-8'),
2963 'upload_date': upload_date,
2964 'title': video_title,
2965 'stitle': simple_title,
2966 'ext': video_extension.decode('utf-8'),
2967 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2968 'thumbnail': video_thumbnail.decode('utf-8'),
2969 'description': video_description.decode('utf-8'),
2972 except UnavailableVideoError, err:
2973 self._downloader.trouble(u'\nERROR: unable to download video')
2975 class BlipTVIE(InfoExtractor):
2976 """Information extractor for blip.tv"""
2978 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to derive the file extension from the media URL.
2979 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2980 IE_NAME = u'blip.tv'
2982 def report_extraction(self, file_id):
2983 """Report information extraction."""
2984 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2986 def report_direct_download(self, title):
2987 """Report information extraction."""
2988 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2990 def _real_extract(self, url):
2991 mobj = re.match(self._VALID_URL, url)
2993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request the JSON metadata variant of the page.
3000 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3001 request = urllib2.Request(json_url)
3002 self.report_extraction(mobj.group(1))
3005 urlh = urllib2.urlopen(request)
# If the server answers with the media itself (video/*) there is no JSON
# to parse: build minimal info from the URL's basename instead.
3006 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3007 basename = url.split('/')[-1]
3008 title,ext = os.path.splitext(basename)
3009 title = title.decode('UTF-8')
3010 ext = ext.replace('.', '')
3011 self.report_direct_download(title)
3016 'stitle': _simplify_title(title),
3020 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3021 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3023 if info is None: # Regular URL
3025 json_code = urlh.read()
3026 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3027 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3031 json_data = json.loads(json_code)
3032 if 'Post' in json_data:
3033 data = json_data['Post']
# NOTE(review): '%H' is the 24-hour directive yet the format also parses
# '%p' (AM/PM); strptime accepts this but the AM/PM part cannot affect
# the hour -- confirm against real blip.tv datestamps.
3037 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3038 video_url = data['media']['url']
3039 umobj = re.match(self._URL_EXT, video_url)
3041 raise ValueError('Can not determine filename extension')
3042 ext = umobj.group(1)
3045 'id': data['item_id'],
3047 'uploader': data['display_name'],
3048 'upload_date': upload_date,
3049 'title': data['title'],
3050 'stitle': _simplify_title(data['title']),
3052 'format': data['media']['mimeType'],
3053 'thumbnail': data['thumbnailUrl'],
3054 'description': data['description'],
3055 'player_url': data['embedUrl']
3057 except (ValueError,KeyError), err:
3058 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3061 self._downloader.increment_downloads()
3064 self._downloader.process_info(info)
3065 except UnavailableVideoError, err:
3066 self._downloader.trouble(u'\nERROR: unable to download video')
3069 class MyVideoIE(InfoExtractor):
3070 """Information Extractor for myvideo.de."""
3072 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3073 IE_NAME = u'myvideo'
3075 def __init__(self, downloader=None):
3076 InfoExtractor.__init__(self, downloader)
3078 def report_download_webpage(self, video_id):
3079 """Report webpage download."""
3080 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3082 def report_extraction(self, video_id):
3083 """Report information extraction."""
3084 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3086 def _real_extract(self,url):
3087 mobj = re.match(self._VALID_URL, url)
3089 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3092 video_id = mobj.group(1)
3095 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3097 self.report_download_webpage(video_id)
3098 webpage = urllib2.urlopen(request).read()
3099 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3100 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3103 self.report_extraction(video_id)
3104 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3107 self._downloader.trouble(u'ERROR: unable to extract media URL')
3109 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3111 mobj = re.search('<title>([^<]+)</title>', webpage)
3113 self._downloader.trouble(u'ERROR: unable to extract title')
3116 video_title = mobj.group(1)
3117 video_title = sanitize_title(video_title)
3119 simple_title = _simplify_title(video_title)
3122 self._downloader.process_info({
3126 'upload_date': u'NA',
3127 'title': video_title,
3128 'stitle': simple_title,
3133 except UnavailableVideoError:
3134 self._downloader.trouble(u'\nERROR: Unable to download video')
# ComedyCentralIE: resolves Daily Show / Colbert Report shortcut URLs or
# full-episode URLs, follows the site redirect, locates the Flash player
# URI in the page, downloads the MRSS show index, then a per-item
# mediaGen config, and queues every rendition-bearing item for download.
# NOTE(review): the embedded original line numbers skip values, so this
# copy is missing lines (e.g. `if mobj is None:` guards, `try:` headers,
# `return` statements, parts of the info dict) — do not treat the block
# as runnable as-is.
3136 class ComedyCentralIE(InfoExtractor):
3137 	"""Information extractor for The Daily Show and Colbert Report """
3139 	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3140 	IE_NAME = u'comedycentral'
# Progress reporters: one per network phase, all print via the downloader.
3142 	def report_extraction(self, episode_id):
3143 		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3145 	def report_config_download(self, episode_id):
3146 		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3148 	def report_index_download(self, episode_id):
3149 		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3151 	def report_player_url(self, episode_id):
3152 		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3154 	def _real_extract(self, url):
3155 		mobj = re.match(self._VALID_URL, url)
3157 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut names (":tds", ":colbert", ...) are rewritten to the show's
# full-episodes page and re-matched so the named groups are populated.
3160 		if mobj.group('shortname'):
3161 			if mobj.group('shortname') in ('tds', 'thedailyshow'):
3162 				url = u'http://www.thedailyshow.com/full-episodes/'
3164 				url = u'http://www.colbertnation.com/full-episodes/'
3165 			mobj = re.match(self._VALID_URL, url)
3166 			assert mobj is not None
# No explicit episode in the URL means "download the newest episode".
3168 		dlNewest = not mobj.group('episode')
3170 			epTitle = mobj.group('showname')
3172 			epTitle = mobj.group('episode')
3174 		req = urllib2.Request(url)
3175 		self.report_extraction(epTitle)
3177 			htmlHandle = urllib2.urlopen(req)
3178 			html = htmlHandle.read()
3179 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3180 			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The site redirects the generic full-episodes page to a concrete
# episode; re-validate the final URL after following the redirect.
3183 			url = htmlHandle.geturl()
3184 			mobj = re.match(self._VALID_URL, url)
3186 				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3188 			if mobj.group('episode') == '':
3189 				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3191 			epTitle = mobj.group('episode')
# Locate the mtvnservices Flash URL embedded either as a <param> or a
# JS "var url = ..." assignment; group 2 is the feed URI for the index.
3193 		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3194 		if len(mMovieParams) == 0:
3195 			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# The player URL is resolved by following redirects on the raw URL.
3198 		playerUrl_raw = mMovieParams[0][0]
3199 		self.report_player_url(epTitle)
3201 			urlHandle = urllib2.urlopen(playerUrl_raw)
3202 			playerUrl = urlHandle.geturl()
3203 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3204 			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS show index listing the episode's media items.
3207 		uri = mMovieParams[0][1]
3208 		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3209 		self.report_index_download(epTitle)
3211 			indexXml = urllib2.urlopen(indexUrl).read()
3212 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3213 			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3216 		idoc = xml.etree.ElementTree.fromstring(indexXml)
3217 		itemEls = idoc.findall('.//item')
# One download per <item>: the guid encodes both the show id and the
# short media id (colon-separated components).
3218 		for itemEl in itemEls:
3219 			mediaId = itemEl.findall('./guid')[0].text
3220 			shortMediaId = mediaId.split(':')[-1]
3221 			showId = mediaId.split(':')[-2].replace('.com', '')
3222 			officialTitle = itemEl.findall('./title')[0].text
3223 			officialDate = itemEl.findall('./pubDate')[0].text
3225 			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3226 						urllib.urlencode({'uri': mediaId}))
3227 			configReq = urllib2.Request(configUrl)
3228 			self.report_config_download(epTitle)
3230 				configXml = urllib2.urlopen(configReq).read()
3231 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3232 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3235 			cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, url) pairs for every available rendition.
3237 			for rendition in cdoc.findall('.//rendition'):
3238 				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3242 				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3245 			# For now, just pick the highest bitrate
3246 			format,video_url = turls[-1]
3248 			self._downloader.increment_downloads()
3250 			effTitle = showId + u'-' + epTitle
3255 				'upload_date': officialDate,
3257 				'stitle': _simplify_title(effTitle),
3261 				'description': officialTitle,
3262 				'player_url': playerUrl
3266 				self._downloader.process_info(info)
3267 			except UnavailableVideoError, err:
3268 				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# EscapistIE: scrapes an escapistmagazine.com video page's meta tags
# (description, og:image, og:video), pulls the JSON-ish player config
# referenced by the og:video URL, and takes the media URL from the
# config's playlist entry at index 1.
# NOTE(review): embedded original line numbers skip values, so guard
# lines (`if ... is None:`, `try:`, `return`) are elided in this copy.
3272 class EscapistIE(InfoExtractor):
3273 	"""Information extractor for The Escapist """
3275 	_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3276 	IE_NAME = u'escapist'
3278 	def report_extraction(self, showName):
3279 		self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3281 	def report_config_download(self, showName):
3282 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3284 	def _real_extract(self, url):
# HTMLParser instance is used only for unescaping entities in meta tags.
3285 		htmlParser = HTMLParser.HTMLParser()
3287 		mobj = re.match(self._VALID_URL, url)
3289 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3291 		showName = mobj.group('showname')
3292 		videoId = mobj.group('episode')
3294 		self.report_extraction(showName)
3296 			webPage = urllib2.urlopen(url).read()
3297 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3298 			self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata out of the page's <meta> tags.
3301 		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3302 		description = htmlParser.unescape(descMatch.group(1))
3303 		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3304 		imgUrl = htmlParser.unescape(imgMatch.group(1))
3305 		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3306 		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the config location in its "config=" parameter.
3307 		configUrlMatch = re.search('config=(.*)$', playerUrl)
3308 		configUrl = urllib2.unquote(configUrlMatch.group(1))
3310 		self.report_config_download(showName)
3312 			configJSON = urllib2.urlopen(configUrl).read()
3313 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3314 			self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3317 		# Technically, it's JavaScript, not JSON
# Single-quote to double-quote conversion makes the JS object parseable
# by the json module (fragile if values themselves contain quotes).
3318 		configJSON = configJSON.replace("'", '"')
3321 			config = json.loads(configJSON)
3322 		except (ValueError,), err:
3323 			self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Playlist entry 1 holds the actual media; entry 0 is presumably an
# intro/ad — TODO confirm against a live config.
3326 		playlist = config['playlist']
3327 		videoUrl = playlist[1]['url']
3329 		self._downloader.increment_downloads()
3333 			'uploader': showName,
3334 			'upload_date': None,
3336 			'stitle': _simplify_title(showName),
3339 			'thumbnail': imgUrl,
3340 			'description': description,
3341 			'player_url': playerUrl,
3345 			self._downloader.process_info(info)
3346 		except UnavailableVideoError, err:
3347 			self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# CollegeHumorIE: maps the public video id from the URL to the site's
# internal id found in the page ("id=video:NNN"), then reads title,
# description, file URL and thumbnail from the moogaloop metadata XML.
# NOTE(review): embedded original line numbers skip values; guard and
# `try:`/`return` lines are elided from this copy.
3350 class CollegeHumorIE(InfoExtractor):
3351 	"""Information extractor for collegehumor.com"""
3353 	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3354 	IE_NAME = u'collegehumor'
3356 	def report_webpage(self, video_id):
3357 		"""Report information extraction."""
3358 		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3360 	def report_extraction(self, video_id):
3361 		"""Report information extraction."""
3362 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3364 	def _real_extract(self, url):
3365 		htmlParser = HTMLParser.HTMLParser()
3367 		mobj = re.match(self._VALID_URL, url)
3369 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3371 		video_id = mobj.group('videoid')
3373 		self.report_webpage(video_id)
3374 		request = urllib2.Request(url)
3376 			webpage = urllib2.urlopen(request).read()
3377 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3378 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds the internal video id, which keys the metadata XML.
3381 		m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3383 			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3385 		internal_video_id = m.group('internalvideoid')
3389 			'internal_id': internal_video_id,
3392 		self.report_extraction(video_id)
3393 		xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3395 			metaXml = urllib2.urlopen(xmlUrl).read()
3396 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3397 			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Populate the info dict from <video> child elements; the extension is
# derived from the file URL's suffix and reused as the format label.
3400 		mdoc = xml.etree.ElementTree.fromstring(metaXml)
3402 			videoNode = mdoc.findall('./video')[0]
3403 			info['description'] = videoNode.findall('./description')[0].text
3404 			info['title'] = videoNode.findall('./caption')[0].text
3405 			info['stitle'] = _simplify_title(info['title'])
3406 			info['url'] = videoNode.findall('./file')[0].text
3407 			info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3408 			info['ext'] = info['url'].rpartition('.')[2]
3409 			info['format'] = info['ext']
3411 			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3414 		self._downloader.increment_downloads()
3417 			self._downloader.process_info(info)
3418 		except UnavailableVideoError, err:
3419 			self._downloader.trouble(u'\nERROR: unable to download video')
# XVideosIE: extracts the FLV URL ("flv_url=" query fragment), the page
# title and the thumbnail URL from an xvideos.com video page.
# NOTE(review): embedded original line numbers skip values; `if mobj is
# None:`/`try:`/`return` lines are elided from this copy.
3422 class XVideosIE(InfoExtractor):
3423 	"""Information extractor for xvideos.com"""
3425 	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3426 	IE_NAME = u'xvideos'
3428 	def report_webpage(self, video_id):
3429 		"""Report information extraction."""
3430 		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3432 	def report_extraction(self, video_id):
3433 		"""Report information extraction."""
3434 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3436 	def _real_extract(self, url):
# htmlParser appears unused in the visible lines of this method.
3437 		htmlParser = HTMLParser.HTMLParser()
3439 		mobj = re.match(self._VALID_URL, url)
3441 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3443 		video_id = mobj.group(1).decode('utf-8')
3445 		self.report_webpage(video_id)
# Canonical page URL is rebuilt from the numeric id.
3447 		request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3449 			webpage = urllib2.urlopen(request).read()
3450 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3451 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3454 		self.report_extraction(video_id)
# Media URL is percent-encoded inside the page's "flv_url=" parameter.
3458 		mobj = re.search(r'flv_url=(.+?)&', webpage)
3460 			self._downloader.trouble(u'ERROR: unable to extract video url')
3462 		video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3466 		mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3468 			self._downloader.trouble(u'ERROR: unable to extract video title')
3470 		video_title = mobj.group(1).decode('utf-8')
3473 		# Extract video thumbnail
3474 		mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3476 			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3478 		video_thumbnail = mobj.group(1).decode('utf-8')
3482 		self._downloader.increment_downloads()
3487 			'upload_date': None,
3488 			'title': video_title,
3489 			'stitle': _simplify_title(video_title),
3492 			'thumbnail': video_thumbnail,
3493 			'description': None,
3498 			self._downloader.process_info(info)
3499 		except UnavailableVideoError, err:
3500 			self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# SoundcloudIE: scrapes uid + stream token from a soundcloud.com track
# page, builds the media.soundcloud.com stream URL from them, and also
# pulls title, description and upload date out of the page HTML.
# NOTE(review): embedded original line numbers skip values; guards,
# `try:` headers and parts of the info dict are elided from this copy.
3503 class SoundcloudIE(InfoExtractor):
3504 	"""Information extractor for soundcloud.com
3505 	   To access the media, the uid of the song and a stream token
3506 	   must be extracted from the page source and the script must make
3507 	   a request to media.soundcloud.com/crossdomain.xml. Then
3508 	   the media can be grabbed by requesting from an url composed
3509 	   of the stream token and uid
3512 	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3513 	IE_NAME = u'soundcloud'
3515 	def __init__(self, downloader=None):
3516 		InfoExtractor.__init__(self, downloader)
3518 	def report_webpage(self, video_id):
3519 		"""Report information extraction."""
3520 		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3522 	def report_extraction(self, video_id):
3523 		"""Report information extraction."""
3524 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3526 	def _real_extract(self, url):
3527 		htmlParser = HTMLParser.HTMLParser()
3529 		mobj = re.match(self._VALID_URL, url)
3531 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3534 		# extract uploader (which is in the url)
3535 		uploader = mobj.group(1).decode('utf-8')
3536 		# extract simple title (uploader + slug of song title)
3537 		slug_title = mobj.group(2).decode('utf-8')
3538 		simple_title = uploader + '-' + slug_title
3540 		self.report_webpage('%s/%s' % (uploader, slug_title))
3542 		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3544 			webpage = urllib2.urlopen(request).read()
3545 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3546 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3549 		self.report_extraction('%s/%s' % (uploader, slug_title))
3551 		# extract uid and stream token that soundcloud hands out for access
3552 		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3554 			video_id = mobj.group(1)
3555 			stream_token = mobj.group(2)
3557 		# extract unsimplified title
3558 		mobj = re.search('"title":"(.*?)",', webpage)
3560 			title = mobj.group(1)
3562 		# construct media url (with uid/token)
3563 		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3564 		mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; fall back to a fixed placeholder.
3567 		description = u'No description available'
3568 		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3570 			description = mobj.group(1)
# Upload date parsed from the page's pretty-date ("Month DD, YYYY HH:MM")
# and normalized to YYYYMMDD; failures are tolerated (except clause below).
3574 		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3577 				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3578 			except Exception, e:
3581 		# for soundcloud, a request to a cross domain is required for cookies
3582 		request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3585 			self._downloader.process_info({
3586 				'id': video_id.decode('utf-8'),
3588 				'uploader': uploader.decode('utf-8'),
3589 				'upload_date': upload_date,
3590 				'title': simple_title.decode('utf-8'),
3591 				'stitle': simple_title.decode('utf-8'),
3595 				'description': description.decode('utf-8')
3597 		except UnavailableVideoError:
3598 			self._downloader.trouble(u'\nERROR: unable to download video')
# InfoQIE: reads the base64-encoded "jsclassref" attribute from an
# infoq.com presentation page and turns it into an rtmpe:// media URL;
# title and description come from page JS/meta tags. The video id and
# extension are taken from the last path component of the media URL.
# NOTE(review): embedded original line numbers skip values; guards and
# `try:`/`return` lines are elided from this copy.
3601 class InfoQIE(InfoExtractor):
3602 	"""Information extractor for infoq.com"""
3604 	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3607 	def report_webpage(self, video_id):
3608 		"""Report information extraction."""
3609 		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3611 	def report_extraction(self, video_id):
3612 		"""Report information extraction."""
3613 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3615 	def _real_extract(self, url):
3616 		htmlParser = HTMLParser.HTMLParser()
3618 		mobj = re.match(self._VALID_URL, url)
3620 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3623 		self.report_webpage(url)
3625 		request = urllib2.Request(url)
3627 			webpage = urllib2.urlopen(request).read()
3628 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3629 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3632 		self.report_extraction(url)
# "jsclassref" holds the base64-encoded tail of the RTMPE stream URL.
3636 		mobj = re.search(r"jsclassref='([^']*)'", webpage)
3638 			self._downloader.trouble(u'ERROR: unable to extract video url')
3640 		video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3644 		mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3646 			self._downloader.trouble(u'ERROR: unable to extract video title')
3648 		video_title = mobj.group(1).decode('utf-8')
3650 		# Extract description
3651 		video_description = u'No description available.'
3652 		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3653 		if mobj is not None:
3654 			video_description = mobj.group(1).decode('utf-8')
# id and extension are derived from the media URL's filename part.
3656 		video_filename = video_url.split('/')[-1]
3657 		video_id, extension = video_filename.split('.')
3659 		self._downloader.increment_downloads()
3664 			'upload_date': None,
3665 			'title': video_title,
3666 			'stitle': _simplify_title(video_title),
3668 			'format': extension, # Extension is always(?) mp4, but seems to be flv
3670 			'description': video_description,
3675 			self._downloader.process_info(info)
3676 		except UnavailableVideoError, err:
3677 			self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# MixcloudIE: queries mixcloud's JSON API for a cloudcast, offering
# format/bitrate selection: get_urls() resolves candidate URLs for a
# format (handling both bitrate-keyed dicts and plain lists),
# check_urls() probes them and returns the first live one, and
# _print_formats() implements --list-formats.
# NOTE(review): embedded original line numbers skip values; `try:`
# headers, `return`s and some branches are elided from this copy.
3679 class MixcloudIE(InfoExtractor):
3680 	"""Information extractor for www.mixcloud.com"""
3681 	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3682 	IE_NAME = u'mixcloud'
3684 	def __init__(self, downloader=None):
3685 		InfoExtractor.__init__(self, downloader)
3687 	def report_download_json(self, file_id):
3688 		"""Report JSON download."""
3689 		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3691 	def report_extraction(self, file_id):
3692 		"""Report information extraction."""
3693 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3695 	def get_urls(self, jsonData, fmt, bitrate='best'):
3696 		"""Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} dict or a bare url list;
# the TypeError fallback below covers the list case.
3699 			bitrate_list = jsonData[fmt]
3700 			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3701 				bitrate = max(bitrate_list) # select highest
3703 			url_list = jsonData[fmt][bitrate]
3704 		except TypeError: # we have no bitrate info.
3705 			url_list = jsonData[fmt]
3709 	def check_urls(self, url_list):
3710 		"""Returns 1st active url from list"""
# Probes each candidate with a GET; network errors move to the next url.
3711 		for url in url_list:
3713 				urllib2.urlopen(url)
3715 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3720 	def _print_formats(self, formats):
3721 		print 'Available formats:'
3722 		for fmt in formats.keys():
3723 			for b in formats[fmt]:
3725 					ext = formats[fmt][b][0]
3726 					print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3727 				except TypeError: # we have no bitrate info
3728 					ext = formats[fmt][0]
3729 					print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3732 	def _real_extract(self, url):
3733 		mobj = re.match(self._VALID_URL, url)
3735 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3737 		# extract uploader & filename from url
3738 		uploader = mobj.group(1).decode('utf-8')
3739 		file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3741 		# construct API request
3742 		file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3743 		# retrieve .json file with links to files
3744 		request = urllib2.Request(file_url)
3746 			self.report_download_json(file_url)
3747 			jsonData = urllib2.urlopen(request).read()
3748 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3749 			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3753 		json_data = json.loads(jsonData)
3754 		player_url = json_data['player_swf_url']
3755 		formats = dict(json_data['audio_formats'])
3757 		req_format = self._downloader.params.get('format', None)
3760 		if self._downloader.params.get('listformats', None):
3761 			self._print_formats(formats)
# 'best' (or no requested format): take the first format whose URL list
# yields a live URL; otherwise honor the explicitly requested format.
3764 		if req_format is None or req_format == 'best':
3765 			for format_param in formats.keys():
3766 				url_list = self.get_urls(formats, format_param)
3768 				file_url = self.check_urls(url_list)
3769 				if file_url is not None:
3772 			if req_format not in formats.keys():
3773 				self._downloader.trouble(u'ERROR: format is not available')
3776 			url_list = self.get_urls(formats, req_format)
3777 			file_url = self.check_urls(url_list)
3778 			format_param = req_format
3781 		self._downloader.increment_downloads()
3783 		# Process file information
3784 		self._downloader.process_info({
3785 			'id': file_id.decode('utf-8'),
3786 			'url': file_url.decode('utf-8'),
3787 			'uploader': uploader.decode('utf-8'),
3788 			'upload_date': u'NA',
3789 			'title': json_data['name'],
3790 			'stitle': _simplify_title(json_data['name']),
3791 			'ext': file_url.split('.')[-1].decode('utf-8'),
3792 			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3793 			'thumbnail': json_data['thumbnail_url'],
3794 			'description': json_data['description'],
3795 			'player_url': player_url.decode('utf-8'),
3797 		except UnavailableVideoError, err:
3798 			self._downloader.trouble(u'ERROR: unable to download file')
# StanfordOpenClassroomIE: handles three URL shapes — a specific video
# (course+video: download its metadata XML and queue the media file), a
# course page (collect VideoPage links and recursively self.extract()
# each), and the bare site root (collect CoursePage links likewise).
# NOTE(review): embedded original line numbers skip values; `try:`
# headers, `return`s and list-append lines are elided from this copy.
3800 class StanfordOpenClassroomIE(InfoExtractor):
3801 	"""Information extractor for Stanford's Open ClassRoom"""
3803 	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3804 	IE_NAME = u'stanfordoc'
3806 	def report_download_webpage(self, objid):
3807 		"""Report information extraction."""
3808 		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3810 	def report_extraction(self, video_id):
3811 		"""Report information extraction."""
3812 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3814 	def _real_extract(self, url):
3815 		mobj = re.match(self._VALID_URL, url)
3817 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a concrete video within a course ---
3820 		if mobj.group('course') and mobj.group('video'): # A specific video
3821 			course = mobj.group('course')
3822 			video = mobj.group('video')
3824 				'id': _simplify_title(course + '_' + video),
3827 			self.report_extraction(info['id'])
3828 			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3829 			xmlUrl = baseUrl + video + '.xml'
3831 				metaXml = urllib2.urlopen(xmlUrl).read()
3832 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3833 				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3835 			mdoc = xml.etree.ElementTree.fromstring(metaXml)
3837 				info['title'] = mdoc.findall('./title')[0].text
# videoFile in the XML is relative; prepend the course's media base URL.
3838 				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3840 				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3842 			info['stitle'] = _simplify_title(info['title'])
3843 			info['ext'] = info['url'].rpartition('.')[2]
3844 			info['format'] = info['ext']
3845 			self._downloader.increment_downloads()
3847 				self._downloader.process_info(info)
3848 			except UnavailableVideoError, err:
3849 				self._downloader.trouble(u'\nERROR: unable to download video')
# --- Case 2: a course page listing many videos ---
3850 		elif mobj.group('course'): # A course page
3851 			unescapeHTML = HTMLParser.HTMLParser().unescape
3853 			course = mobj.group('course')
3855 				'id': _simplify_title(course),
3859 			self.report_download_webpage(info['id'])
3861 				coursepage = urllib2.urlopen(url).read()
3862 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3863 				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3866 			m = re.search('<h1>([^<]+)</h1>', coursepage)
3868 				info['title'] = unescapeHTML(m.group(1))
3870 				info['title'] = info['id']
3871 			info['stitle'] = _simplify_title(info['title'])
3873 			m = re.search('<description>([^<]+)</description>', coursepage)
3875 				info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry, re-dispatched below.
3877 			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3880 					'type': 'reference',
3881 					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3885 			for entry in info['list']:
3886 				assert entry['type'] == 'reference'
3887 				self.extract(entry['url'])
# --- Case 3: the site root, listing all courses ---
3889 			unescapeHTML = HTMLParser.HTMLParser().unescape
3892 				'id': 'Stanford OpenClassroom',
3896 			self.report_download_webpage(info['id'])
3897 			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3899 				rootpage = urllib2.urlopen(rootURL).read()
3900 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3901 				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3904 			info['title'] = info['id']
3905 			info['stitle'] = _simplify_title(info['title'])
3907 			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3910 					'type': 'reference',
3911 					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3915 			for entry in info['list']:
3916 				assert entry['type'] == 'reference'
3917 				self.extract(entry['url'])
3919 class MTVIE(InfoExtractor):
3920 """Information extractor for MTV.com"""
3922 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3925 def report_webpage(self, video_id):
3926 """Report information extraction."""
3927 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3929 def report_extraction(self, video_id):
3930 """Report information extraction."""
3931 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3933 def _real_extract(self, url):
3934 mobj = re.match(self._VALID_URL, url)
3936 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3938 if not mobj.group('proto'):
3939 url = 'http://' + url
3940 video_id = mobj.group('videoid')
3941 self.report_webpage(video_id)
3943 request = urllib2.Request(url)
3945 webpage = urllib2.urlopen(request).read()
3946 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3947 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3950 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3952 self._downloader.trouble(u'ERROR: unable to extract song name')
3954 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3955 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3957 self._downloader.trouble(u'ERROR: unable to extract performer')
3959 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3960 video_title = performer + ' - ' + song_name
3962 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3964 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3966 mtvn_uri = mobj.group(1)
3968 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3970 self._downloader.trouble(u'ERROR: unable to extract content id')
3972 content_id = mobj.group(1)
3974 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3975 self.report_extraction(video_id)
3976 request = urllib2.Request(videogen_url)
3978 metadataXml = urllib2.urlopen(request).read()
3979 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3980 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3983 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3984 renditions = mdoc.findall('.//rendition')
3986 # For now, always pick the highest quality.
3987 rendition = renditions[-1]
3990 _,_,ext = rendition.attrib['type'].partition('/')
3991 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3992 video_url = rendition.find('./src').text
3994 self._downloader.trouble('Invalid rendition field.')
3997 self._downloader.increment_downloads()
4001 'uploader': performer,
4002 'title': video_title,
4003 'stitle': _simplify_title(video_title),
4009 self._downloader.process_info(info)
4010 except UnavailableVideoError, err:
4011 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on
	each one — first with the initial information dictionary, then
	with whatever the previous processor returned.

	The chain stops as soon as a run() returns None or the last
	processor in the chain has executed.

	PostProcessor objects follow a "mutual registration" process
	similar to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, with one extra field: "filepath"
		points at the downloaded file.

		Returning None stops the postprocessing chain; returning a
		(possibly modified) information dictionary passes it to the
		next processor. Implementations may also raise a
		PostProcessingError, which the downloader takes into account.
		"""
		# Default behaviour: pass the information through untouched.
		return information
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails during audio extraction/conversion.

	BUG FIX: this previously inherited from BaseException, so generic
	`except Exception:` handlers did not catch it. BaseException is
	reserved for exit-style exceptions (SystemExit, KeyboardInterrupt);
	ordinary error types must derive from Exception.
	"""

	def __init__(self, message):
		# Human-readable error text (e.g. the last line of ffmpeg's stderr).
		self.message = message
4064 class FFmpegExtractAudioPP(PostProcessor):
def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
	"""Configure audio extraction: target codec, target quality, and
	whether the original video file is kept after conversion."""
	PostProcessor.__init__(self, downloader)
	# A codec of None means 'best': keep the source audio stream
	# lossless whenever its codec allows it.
	self._preferredcodec = preferredcodec if preferredcodec is not None else 'best'
	self._preferredquality = preferredquality
	self._keepvideo = keepvideo
# get_audio_codec(path): run ffprobe on the file and return the codec
# name of its audio stream, or (per the elided branches) presumably None
# when ffprobe fails or no audio stream is found — TODO confirm against
# the elided lines.
# NOTE(review): embedded original line numbers skip values; the `try:`
# header, `return` lines and the audio_codec initialisation are elided
# from this copy. Note also `file(...)` and `split('\n')` are Python 2
# idioms, consistent with the rest of this file.
4075 	def get_audio_codec(path):
4077 			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4078 			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4079 			output = handle.communicate()[0]
4080 			if handle.wait() != 0:
4082 		except (IOError, OSError):
# Parse ffprobe's key=value stream dump: remember the last codec_name
# seen, and accept it once a codec_type=audio line confirms the stream
# is audio.
4085 		for line in output.split('\n'):
4086 			if line.startswith('codec_name='):
4087 				audio_codec = line.split('=')[1].strip()
4088 			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# run_ffmpeg(path, out_path, codec, more_opts): invoke ffmpeg to
# extract/convert the audio stream of `path` into `out_path` using the
# given codec and extra options; raises AudioConversionError on failure
# (ffmpeg missing, or non-zero exit status).
# NOTE(review): embedded original line numbers skip values; the `try:`
# header and the codec-is-None branch around acodec_opts are elided
# from this copy.
4093 	def run_ffmpeg(path, out_path, codec, more_opts):
4097 			acodec_opts = ['-acodec', codec]
# '-vn' drops the video stream; '--' guards against option-like filenames.
4098 		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4100 			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4101 			stdout,stderr = p.communicate()
4102 		except (IOError, OSError):
4103 			e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself was not found on PATH.
4104 			if isinstance(e, OSError) and e.errno == 2:
4105 				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4108 		if p.returncode != 0:
# Surface only ffmpeg's final stderr line as the error message.
4109 			msg = stderr.strip().split('\n')[-1]
4110 			raise AudioConversionError(msg)
4112 def run(self, information):
4113 path = information['filepath']
4115 filecodec = self.get_audio_codec(path)
4116 if filecodec is None:
4117 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4121 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4122 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4123 # Lossless, but in another container
4125 extension = self._preferredcodec
4126 more_opts = ['-absf', 'aac_adtstoasc']
4127 elif filecodec in ['aac', 'mp3', 'vorbis']:
4128 # Lossless if possible
4130 extension = filecodec
4131 if filecodec == 'aac':
4132 more_opts = ['-f', 'adts']
4133 if filecodec == 'vorbis':
4137 acodec = 'libmp3lame'
4140 if self._preferredquality is not None:
4141 more_opts += ['-ab', self._preferredquality]
4143 # We convert the audio (lossy)
4144 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4145 extension = self._preferredcodec
4147 if self._preferredquality is not None:
4148 more_opts += ['-ab', self._preferredquality]
4149 if self._preferredcodec == 'aac':
4150 more_opts += ['-f', 'adts']
4151 if self._preferredcodec == 'm4a':
4152 more_opts += ['-absf', 'aac_adtstoasc']
4153 if self._preferredcodec == 'vorbis':
4155 if self._preferredcodec == 'wav':
4157 more_opts += ['-f', 'wav']
4159 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4160 new_path = prefix + sep + extension
4161 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4163 self.run_ffmpeg(path, new_path, acodec, more_opts)
4165 etype,e,tb = sys.exc_info()
4166 if isinstance(e, AudioConversionError):
4167 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4169 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4172 # Try to update the date time for extracted audio file.
4173 if information.get('filetime') is not None:
4175 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4177 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4179 if not self._keepvideo:
4181 os.remove(_encodeFilename(path))
4182 except (IOError, OSError):
4183 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4186 information['filepath'] = new_path
# NOTE(review): sampled/mangled paste -- stray leading line numbers, and the
# try: lines plus part of the rewrite logic are missing from this view.
4190 def updateSelf(downloader, filename):
4191 ''' Update the program file with the latest version from the repository '''
4192 # Note: downloader only used for options
# Refuse to self-update when the script file is not writable by us.
4193 if not os.access(filename, os.W_OK):
4194 sys.exit('ERROR: no write permissions on %s' % filename)
4196 downloader.to_screen(u'Updating to latest version...')
# Fetch the latest script from UPDATE_URL (module-level constant).
4200 urlh = urllib.urlopen(UPDATE_URL)
4201 newcontent = urlh.read()
# Compare the downloaded __version__ against ours; skip rewrite if equal.
4203 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4204 if vmatch is not None and vmatch.group(1) == __version__:
4205 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4209 except (IOError, OSError), err:
4210 sys.exit('ERROR: unable to download latest version')
# Overwrite our own file with the downloaded content.
4213 outf = open(filename, 'wb')
4215 outf.write(newcontent)
4218 except (IOError, OSError), err:
4219 sys.exit('ERROR: unable to overwrite current version')
4221 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read shell-style options from a config file; missing file yields [].
# NOTE(review): sampled paste -- the try/except scaffolding and the loop
# header around the shlex.split accumulation are missing from this view.
4224 def _readOptions(filename_bytes):
4226 optionf = open(filename_bytes)
4228 return [] # silently skip if file is not present
# Each line is split like a shell command line; '#' starts a comment.
4232 res += shlex.split(l, comments=True)
# Custom optparse option-string formatter: "-o, --option METAVAR".
# NOTE(review): sampled paste -- the 'opts = []' initialisation line is
# missing from this view.
4237 def _format_option_string(option):
4238 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Show at most one short and one long form, comma-separated.
4242 if option._short_opts: opts.append(option._short_opts[0])
4243 if option._long_opts: opts.append(option._long_opts[0])
4244 if len(opts) > 1: opts.insert(1, ', ')
# Append the metavar only for options that take a value.
4246 if option.takes_value(): opts.append(' %s' % option.metavar)
4248 return "".join(opts)
# Best-effort terminal width detection: $COLUMNS first, then 'stty size'.
# NOTE(review): sampled paste -- the COLUMNS return path and the try/except
# around the stty call are missing from this view.
4250 def _find_term_columns():
4251 columns = os.environ.get('COLUMNS', None)
4256 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4257 out,err = sp.communicate()
# 'stty size' prints "rows cols"; the second field is the width.
4258 return int(out.split()[1])
# Interior of parseOpts() (its 'def' line is missing from this sampled view):
# build the optparse parser with a custom help formatter and option groups.
4264 max_help_position = 80
4266 # No need to wrap help messages if we're on a wide console
4267 columns = _find_term_columns()
4268 if columns: max_width = columns
4270 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Override option rendering with the module's compact formatter.
4271 fmt.format_option_strings = _format_option_string
# Keyword arguments for OptionParser (dict literal opener missing here).
4274 'version' : __version__,
4276 'usage' : '%prog [options] url [url...]',
4277 'conflict_handler' : 'resolve',
4280 parser = optparse.OptionParser(**kw)
# One OptionGroup per help section.
4283 general = optparse.OptionGroup(parser, 'General Options')
4284 selection = optparse.OptionGroup(parser, 'Video Selection')
4285 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4286 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4287 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4288 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4289 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options: help/version, self-update, error tolerance, rate limiting,
# retries, user-agent dump, extractor listing.
4291 general.add_option('-h', '--help',
4292 action='help', help='print this help text and exit')
4293 general.add_option('-v', '--version',
4294 action='version', help='print program version and exit')
4295 general.add_option('-U', '--update',
4296 action='store_true', dest='update_self', help='update this program to latest version')
4297 general.add_option('-i', '--ignore-errors',
4298 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4299 general.add_option('-r', '--rate-limit',
4300 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4301 general.add_option('-R', '--retries',
4302 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4303 general.add_option('--dump-user-agent',
4304 action='store_true', dest='dump_user_agent',
4305 help='display the current browser identification', default=False)
4306 general.add_option('--list-extractors',
4307 action='store_true', dest='list_extractors',
4308 help='List all supported extractors and the URLs they would handle', default=False)
# Video selection: playlist ranges, title regex filters, download cap.
4310 selection.add_option('--playlist-start',
4311 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4312 selection.add_option('--playlist-end',
4313 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4314 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4315 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4316 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
# Authentication: username/password pair or .netrc lookup.
4318 authentication.add_option('-u', '--username',
4319 dest='username', metavar='USERNAME', help='account username')
4320 authentication.add_option('-p', '--password',
4321 dest='password', metavar='PASSWORD', help='account password')
4322 authentication.add_option('-n', '--netrc',
4323 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format selection: explicit code, all formats, free-format
# preference, quality ceiling, and format listing.
4326 video_format.add_option('-f', '--format',
4327 action='store', dest='format', metavar='FORMAT', help='video format code')
4328 video_format.add_option('--all-formats',
4329 action='store_const', dest='format', help='download all available video formats', const='all')
4330 video_format.add_option('--prefer-free-formats',
4331 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4332 video_format.add_option('--max-quality',
4333 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4334 video_format.add_option('-F', '--list-formats',
4335 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
# Verbosity/simulation: quiet/simulate switches plus the --get-* family that
# implies "simulate, quiet, print one attribute".
4338 verbosity.add_option('-q', '--quiet',
4339 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4340 verbosity.add_option('-s', '--simulate',
4341 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4342 verbosity.add_option('--skip-download',
4343 action='store_true', dest='skip_download', help='do not download the video', default=False)
4344 verbosity.add_option('-g', '--get-url',
4345 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4346 verbosity.add_option('-e', '--get-title',
4347 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4348 verbosity.add_option('--get-thumbnail',
4349 action='store_true', dest='getthumbnail',
4350 help='simulate, quiet but print thumbnail URL', default=False)
4351 verbosity.add_option('--get-description',
4352 action='store_true', dest='getdescription',
4353 help='simulate, quiet but print video description', default=False)
4354 verbosity.add_option('--get-filename',
4355 action='store_true', dest='getfilename',
4356 help='simulate, quiet but print output filename', default=False)
4357 verbosity.add_option('--get-format',
4358 action='store_true', dest='getformat',
4359 help='simulate, quiet but print output format', default=False)
4360 verbosity.add_option('--no-progress',
4361 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4362 verbosity.add_option('--console-title',
4363 action='store_true', dest='consoletitle',
4364 help='display progress in console titlebar', default=False)
# NOTE(review): '-v' is registered twice in this file (also as --version
# above); conflict_handler='resolve' lets the later definition win.
4365 verbosity.add_option('-v', '--verbose',
4366 action='store_true', dest='verbose', help='print various debugging information', default=False)
# Filesystem options: output naming, batch input, overwrite/resume behaviour,
# cookies, .part files, mtime handling, and sidecar metadata files.
4369 filesystem.add_option('-t', '--title',
4370 action='store_true', dest='usetitle', help='use title in file name', default=False)
4371 filesystem.add_option('-l', '--literal',
4372 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4373 filesystem.add_option('-A', '--auto-number',
4374 action='store_true', dest='autonumber',
4375 help='number downloaded files starting from 00000', default=False)
4376 filesystem.add_option('-o', '--output',
4377 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4378 filesystem.add_option('-a', '--batch-file',
4379 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4380 filesystem.add_option('-w', '--no-overwrites',
4381 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
# Resuming is on by default; --no-continue flips the same dest off.
4382 filesystem.add_option('-c', '--continue',
4383 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4384 filesystem.add_option('--no-continue',
4385 action='store_false', dest='continue_dl',
4386 help='do not resume partially downloaded files (restart from beginning)')
4387 filesystem.add_option('--cookies',
4388 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4389 filesystem.add_option('--no-part',
4390 action='store_true', dest='nopart', help='do not use .part files', default=False)
4391 filesystem.add_option('--no-mtime',
4392 action='store_false', dest='updatetime',
4393 help='do not use the Last-modified header to set the file modification time', default=True)
4394 filesystem.add_option('--write-description',
4395 action='store_true', dest='writedescription',
4396 help='write video description to a .description file', default=False)
4397 filesystem.add_option('--write-info-json',
4398 action='store_true', dest='writeinfojson',
4399 help='write video metadata to a .info.json file', default=False)
# Post-processing options (audio extraction via ffmpeg), then group
# registration, config-file merging, and the actual parse.
4402 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4403 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4404 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4405 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4406 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4407 help='ffmpeg audio bitrate specification, 128k by default')
4408 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4409 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Group registration order controls --help section order.
4412 parser.add_option_group(general)
4413 parser.add_option_group(selection)
4414 parser.add_option_group(filesystem)
4415 parser.add_option_group(verbosity)
4416 parser.add_option_group(video_format)
4417 parser.add_option_group(authentication)
4418 parser.add_option_group(postproc)
# Per-user config: $XDG_CONFIG_HOME/youtube-dl.conf, falling back to
# ~/.config/youtube-dl.conf (the branch lines around 4421/4423 are
# missing from this sampled view).
4420 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4422 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4424 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
# System config, then user config, then the real command line (later
# arguments override earlier ones in optparse).
4425 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4426 opts, args = parser.parse_args(argv)
4428 return parser, opts, args
# NOTE(review): sampled paste -- the docstring terminator, the 'return ['
# opener, many extractor entries, and the closing bracket are missing.
4430 def gen_extractors():
4431 """ Return a list of an instance of every supported extractor.
4432 The order does matter; the first extractor matched is the one handling the URL.
# Shared IE instances: playlist/user/search extractors delegate to these.
4434 youtube_ie = YoutubeIE()
4435 google_ie = GoogleIE()
4436 yahoo_ie = YahooIE()
# Visible entries of the returned list (more-specific extractors first).
4438 YoutubePlaylistIE(youtube_ie),
4439 YoutubeUserIE(youtube_ie),
4440 YoutubeSearchIE(youtube_ie),
4442 MetacafeIE(youtube_ie),
4445 GoogleSearchIE(google_ie),
4448 YahooSearchIE(yahoo_ie),
4461 StanfordOpenClassroomIE(),
# Interior of the main routine (its 'def' line is missing from this sampled
# view): option parsing, cookie jar, batch-file handling, urllib2 setup.
4468 parser, opts, args = parseOpts()
4470 # Open appropriate CookieJar
# In-memory jar unless --cookies names a Mozilla-format cookie file.
4471 if opts.cookiefile is None:
4472 jar = cookielib.CookieJar()
4475 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load the file if it already exists and is readable.
4476 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4478 except (IOError, OSError), err:
4479 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string (and, presumably, exit -- the
# following line is missing from this view).
4482 if opts.dump_user_agent:
4483 print std_headers['User-Agent']
4486 # Batch file verification
4488 if opts.batchfile is not None:
# '-' reads the URL list from stdin (that branch's body is missing here).
4490 if opts.batchfile == '-':
4493 batchfd = open(opts.batchfile, 'r')
4494 batchurls = batchfd.readlines()
4495 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with '#', '/' or ';' (comments).
4496 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4498 sys.exit(u'ERROR: batch file could not be read')
4499 all_urls = batchurls + args
4501 # General configuration
4502 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4503 proxy_handler = urllib2.ProxyHandler()
4504 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4505 urllib2.install_opener(opener)
4506 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Verbose-mode diagnostics (the guarding 'if opts.verbose:' is missing here).
4509 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4511 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it would take.
4513 if opts.list_extractors:
4514 for ie in extractors:
4516 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4517 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4518 for mu in matchedUrls:
# Option sanity checks; parser.error() prints the message and exits.
# NOTE(review): sampled paste -- the try: lines that pair with the visible
# except clauses are missing from this view.
4522 # Conflicting, missing and erroneous options
4523 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4524 parser.error(u'using .netrc conflicts with giving username/password')
4525 if opts.password is not None and opts.username is None:
4526 parser.error(u'account username missing')
4527 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4528 parser.error(u'using output template conflicts with using title, literal title or auto number')
4529 if opts.usetitle and opts.useliteral:
4530 parser.error(u'using title conflicts with using literal title')
# Prompt interactively for the password when only a username was given.
4531 if opts.username is not None and opts.password is None:
4532 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert human-readable rate limit (e.g. '50k') to bytes/second.
4533 if opts.ratelimit is not None:
4534 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4535 if numeric_limit is None:
4536 parser.error(u'invalid rate limit specified')
4537 opts.ratelimit = numeric_limit
4538 if opts.retries is not None:
4540 opts.retries = long(opts.retries)
4541 except (TypeError, ValueError), err:
4542 parser.error(u'invalid retry count specified')
# Playlist bounds: start is 1-based and positive; end is -1 (open-ended)
# or a value >= start.
4544 opts.playliststart = int(opts.playliststart)
4545 if opts.playliststart <= 0:
4546 raise ValueError(u'Playlist start must be positive')
4547 except (TypeError, ValueError), err:
4548 parser.error(u'invalid playlist start number specified')
4550 opts.playlistend = int(opts.playlistend)
4551 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4552 raise ValueError(u'Playlist end must be greater than playlist start')
4553 except (TypeError, ValueError), err:
4554 parser.error(u'invalid playlist end number specified')
# Must match the codecs FFmpegExtractAudioPP understands.
4555 if opts.extractaudio:
4556 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4557 parser.error(u'invalid audio format specified')
# Build the FileDownloader with all parsed options.  Any --get-* flag
# implies quiet + skip_download (simulation).
4560 fd = FileDownloader({
4561 'usenetrc': opts.usenetrc,
4562 'username': opts.username,
4563 'password': opts.password,
4564 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4565 'forceurl': opts.geturl,
4566 'forcetitle': opts.gettitle,
4567 'forcethumbnail': opts.getthumbnail,
4568 'forcedescription': opts.getdescription,
4569 'forcefilename': opts.getfilename,
4570 'forceformat': opts.getformat,
4571 'simulate': opts.simulate,
4572 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4573 'format': opts.format,
4574 'format_limit': opts.format_limit,
4575 'listformats': opts.listformats,
# Output template: explicit -o wins; otherwise pick a default template
# from the title/literal/autonumber/format flags, most specific first.
4576 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4577 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4578 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4579 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4580 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4581 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4582 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4583 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4584 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4585 or u'%(id)s.%(ext)s'),
4586 'ignoreerrors': opts.ignoreerrors,
4587 'ratelimit': opts.ratelimit,
4588 'nooverwrites': opts.nooverwrites,
4589 'retries': opts.retries,
4590 'continuedl': opts.continue_dl,
4591 'noprogress': opts.noprogress,
4592 'playliststart': opts.playliststart,
4593 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') forces logging to stderr.
4594 'logtostderr': opts.outtmpl == '-',
4595 'consoletitle': opts.consoletitle,
4596 'nopart': opts.nopart,
4597 'updatetime': opts.updatetime,
4598 'writedescription': opts.writedescription,
4599 'writeinfojson': opts.writeinfojson,
4600 'matchtitle': opts.matchtitle,
4601 'rejecttitle': opts.rejecttitle,
4602 'max_downloads': opts.max_downloads,
4603 'prefer_free_formats': opts.prefer_free_formats,
4604 'verbose': opts.verbose,
# Wire extractors and post-processors into the downloader, run the
# downloads, then persist cookies.  The trailing except clauses belong to
# an outer wrapper (its try: line is missing from this sampled view).
4606 for extractor in extractors:
4607 fd.add_info_extractor(extractor)
4610 if opts.extractaudio:
4611 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update replaces this very script file with the latest release.
4614 if opts.update_self:
4615 updateSelf(fd, sys.argv[0])
# With no URLs, only --update by itself is a valid invocation.
4618 if len(all_urls) < 1:
4619 if not opts.update_self:
4620 parser.error(u'you must provide at least one URL')
4625 retcode = fd.download(all_urls)
4626 except MaxDownloadsReached:
4627 fd.to_screen(u'--max-download limit reached, aborting.')
4630 # Dump cookie jar if requested
4631 if opts.cookiefile is not None:
4634 except (IOError, OSError), err:
4635 sys.exit(u'ERROR: unable to save cookie jar')
# Outer error handling: DownloadError (already reported), SameFileError,
# and Ctrl-C all exit with a message.
4642 except DownloadError:
4644 except SameFileError:
4645 sys.exit(u'ERROR: fixed output name but more than one file to download')
4646 except KeyboardInterrupt:
4647 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point; the guarded body (the call into main) is missing
# from this sampled view.
4649 if __name__ == '__main__':
4652 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: