2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.01.08b'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
219 entity = matchobj.group(1)
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
231 numstr = u'0%s' % numstr
234 return unichr(long(numstr, base))
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename.

    HTML entities in the title are decoded through htmlentity_transform(),
    and every occurrence of the platform path separator is replaced with
    u'%' so the title cannot introduce extra path components.
    """
    # u'(?u)&(.+?);' has exactly the same string value as the original
    # Python-2-only literal ur'(?u)&(.+?);' (the pattern contains no
    # backslashes), but the u'' form also parses on Python 3.3+.
    utitle = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(_encodeFilename(filename), open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(_encodeFilename(filename), open_mode)
270 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None when the date string cannot be parsed,
    # so only convert to a timestamp when parsing succeeded.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # NOTE(review): the remainder of this function (initialising and
    # returning `timestamp`) is outside the visible excerpt.
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
293 def _unescapeHTML(s):
295 @param s a string (of type unicode)
297 assert type(s) == type(u'')
299 htmlParser = HTMLParser.HTMLParser()
300 return htmlParser.unescape(s)
302 def _encodeFilename(s):
304 @param s The name of the file (of type unicode)
307 assert type(s) == type(u'')
309 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
310 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
311 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
312 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
315 return s.encode(sys.getfilesystemencoding(), 'ignore')
317 class DownloadError(Exception):
318 """Download Error exception.
320 This exception may be thrown by FileDownloader objects if they are not
321 configured to continue on errors. They will contain the appropriate
327 class SameFileError(Exception):
328 """Same File exception.
330 This exception will be thrown by FileDownloader objects if they detect
331 multiple files would have to be downloaded to the same file on disk.
336 class PostProcessingError(Exception):
337 """Post Processing exception.
339 This exception may be raised by PostProcessor's .run() method to
340 indicate an error in the postprocessing task.
344 class MaxDownloadsReached(Exception):
345 """ --max-downloads limit has been reached. """
349 class UnavailableVideoError(Exception):
350 """Unavailable Format exception.
352 This exception will be thrown when a video is requested
353 in a format that is not available for that video.
358 class ContentTooShortError(Exception):
359 """Content Too Short exception.
361 This exception may be raised by FileDownloader objects when a file they
362 download is too small for what the server announced first, indicating
363 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # Both byte counts are stored so the caller can report the mismatch
    # (see the 'content too short' error message in process_info).
    self.downloaded = downloaded
    self.expected = expected
374 class YoutubeDLHandler(urllib2.HTTPHandler):
375 """Handler for HTTP requests and responses.
377 This class, when installed with an OpenerDirector, automatically adds
378 the standard headers to every HTTP request and handles gzipped and
379 deflated responses from web servers. If compression is to be avoided in
380 a particular request, the original request in the program code only has
381 to include the HTTP header "Youtubedl-No-Compression", which will be
382 removed before making the real request.
384 Part of this code was copied from:
386 http://techknack.net/python-urllib2-handlers/
388 Andrew Rowls, the author of that code, agreed to release it to the
395 return zlib.decompress(data, -zlib.MAX_WBITS)
397 return zlib.decompress(data)
400 def addinfourl_wrapper(stream, headers, url, code):
401 if hasattr(urllib2.addinfourl, 'getcode'):
402 return urllib2.addinfourl(stream, headers, url, code)
403 ret = urllib2.addinfourl(stream, headers, url)
407 def http_request(self, req):
408 for h in std_headers:
411 req.add_header(h, std_headers[h])
412 if 'Youtubedl-no-compression' in req.headers:
413 if 'Accept-encoding' in req.headers:
414 del req.headers['Accept-encoding']
415 del req.headers['Youtubedl-no-compression']
418 def http_response(self, req, resp):
421 if resp.headers.get('Content-encoding', '') == 'gzip':
422 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
423 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
424 resp.msg = old_resp.msg
426 if resp.headers.get('Content-encoding', '') == 'deflate':
427 gz = StringIO.StringIO(self.deflate(resp.read()))
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
433 class FileDownloader(object):
434 """File Downloader class.
436 File downloader objects are the ones responsible of downloading the
437 actual video file and writing it to disk if the user has requested
438 it, among some other tasks. In most cases there should be one per
439 program. As, given a video URL, the downloader doesn't know how to
440 extract all the needed information, task that InfoExtractors do, it
441 has to pass the URL to one of them.
443 For this, file downloader objects have a method that allows
444 InfoExtractors to be registered in a given order. When it is passed
445 a URL, the file downloader handles it to the first InfoExtractor it
446 finds that reports being able to handle it. The InfoExtractor extracts
447 all the information about the video or videos the URL refers to, and
448 asks the FileDownloader to process the video information, possibly
449 downloading the video.
451 File downloaders accept a lot of parameters. In order not to saturate
452 the object constructor with arguments, it receives a dictionary of
453 options instead. These options are available through the params
454 attribute for the InfoExtractors to use. The FileDownloader also
455 registers itself as the downloader in charge for the InfoExtractors
456 that are added to it, so this is a "mutual registration".
460 username: Username for authentication purposes.
461 password: Password for authentication purposes.
462 usenetrc: Use netrc for authentication instead.
463 quiet: Do not print messages to stdout.
464 forceurl: Force printing final URL.
465 forcetitle: Force printing title.
466 forcethumbnail: Force printing thumbnail URL.
467 forcedescription: Force printing description.
468 forcefilename: Force printing final filename.
469 simulate: Do not download the video files.
470 format: Video format code.
471 format_limit: Highest quality format to try.
472 outtmpl: Template for output names.
473 ignoreerrors: Do not stop on download errors.
474 ratelimit: Download speed limit, in bytes/sec.
475 nooverwrites: Prevent overwriting files.
476 retries: Number of times to retry for HTTP error 5xx
477 continuedl: Try to continue downloads if possible.
478 noprogress: Do not print the progress bar.
479 playliststart: Playlist item to start at.
480 playlistend: Playlist item to end at.
481 matchtitle: Download only matching titles.
482 rejecttitle: Reject downloads for matching titles.
483 logtostderr: Log messages to stderr instead of stdout.
484 consoletitle: Display progress in console window's titlebar.
485 nopart: Do not use temporary .part files.
486 updatetime: Use the Last-modified header to set output file timestamps.
487 writedescription: Write the video description to a .description file
488 writeinfojson: Write the video description to a .info.json file
494 _download_retcode = None
495 _num_downloads = None
498 def __init__(self, params):
499 """Create a FileDownloader object with the given options."""
502 self._download_retcode = 0
503 self._num_downloads = 0
504 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
508 def format_bytes(bytes):
511 if type(bytes) is str:
516 exponent = long(math.log(bytes, 1024.0))
517 suffix = 'bkMGTPEZY'[exponent]
518 converted = float(bytes) / float(1024 ** exponent)
519 return '%.2f%s' % (converted, suffix)
522 def calc_percent(byte_counter, data_len):
525 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
528 def calc_eta(start, now, total, current):
532 if current == 0 or dif < 0.001: # One millisecond
534 rate = float(current) / dif
535 eta = long((float(total) - float(current)) / rate)
536 (eta_mins, eta_secs) = divmod(eta, 60)
539 return '%02d:%02d' % (eta_mins, eta_secs)
542 def calc_speed(start, now, bytes):
544 if bytes == 0 or dif < 0.001: # One millisecond
545 return '%10s' % '---b/s'
546 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
549 def best_block_size(elapsed_time, bytes):
550 new_min = max(bytes / 2.0, 1.0)
551 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
552 if elapsed_time < 0.001:
554 rate = bytes / elapsed_time
562 def parse_bytes(bytestr):
563 """Parse a string indicating a byte quantity into a long integer."""
564 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
567 number = float(matchobj.group(1))
568 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
569 return long(round(number * multiplier))
571 def add_info_extractor(self, ie):
572 """Add an InfoExtractor object to the end of the list."""
574 ie.set_downloader(self)
576 def add_post_processor(self, pp):
577 """Add a PostProcessor object to the end of the chain."""
579 pp.set_downloader(self)
581 def to_screen(self, message, skip_eol=False):
582 """Print message to stdout if not in quiet mode."""
583 assert type(message) == type(u'')
584 if not self.params.get('quiet', False):
585 terminator = [u'\n', u''][skip_eol]
586 output = message + terminator
588 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
589 output = output.encode(preferredencoding(), 'ignore')
590 self._screen_file.write(output)
591 self._screen_file.flush()
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode explicitly: under Python 2 a unicode message may fail to
    # print when stderr's encoding is unset (e.g. redirected output).
    print >>sys.stderr, message.encode(preferredencoding())
597 def to_cons_title(self, message):
598 """Set console/terminal window title to message."""
599 if not self.params.get('consoletitle', False):
601 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
602 # c_wchar_p() might not be necessary if `message` is
603 # already of type unicode()
604 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
605 elif 'TERM' in os.environ:
606 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Checks if the output template is fixed.

    Returns True when the 'outtmpl' parameter contains no %(...)s
    substitution fields, i.e. every download would be written to the
    very same file (download() raises SameFileError in that case for
    multi-URL runs).
    """
    # u'(?u)%\\(.+?\\)s' has exactly the same string value as the
    # original Python-2-only literal ur'(?u)%\(.+?\)s'.
    return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
612 def trouble(self, message=None):
613 """Determine action to take when a download problem appears.
615 Depending on if the downloader has been configured to ignore
616 download errors or not, this method may throw an exception or
617 not when errors are found, after printing the message.
619 if message is not None:
620 self.to_stderr(message)
621 if not self.params.get('ignoreerrors', False):
622 raise DownloadError(message)
623 self._download_retcode = 1
625 def slow_down(self, start_time, byte_counter):
626 """Sleep if the download speed is over the rate limit."""
627 rate_limit = self.params.get('ratelimit', None)
628 if rate_limit is None or byte_counter == 0:
631 elapsed = now - start_time
634 speed = float(byte_counter) / elapsed
635 if speed > rate_limit:
636 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
638 def temp_name(self, filename):
639 """Returns a temporary filename for the given filename."""
640 if self.params.get('nopart', False) or filename == u'-' or \
641 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
643 return filename + u'.part'
645 def undo_temp_name(self, filename):
646 if filename.endswith(u'.part'):
647 return filename[:-len(u'.part')]
650 def try_rename(self, old_filename, new_filename):
652 if old_filename == new_filename:
654 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
655 except (IOError, OSError), err:
656 self.trouble(u'ERROR: unable to rename file')
658 def try_utime(self, filename, last_modified_hdr):
659 """Try to set the last-modified time of the given file."""
660 if last_modified_hdr is None:
662 if not os.path.isfile(_encodeFilename(filename)):
664 timestr = last_modified_hdr
667 filetime = timeconvert(timestr)
671 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description is being written to *descfn*."""
    message = u'[info] Writing video description to: ' + descfn
    self.to_screen(message)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file *infofn* is being written."""
    message = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(message)
def report_destination(self, filename):
    """Show the destination filename of the current download."""
    message = u'[download] Destination: ' + filename
    self.to_screen(message)
688 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
689 """Report download progress."""
690 if self.params.get('noprogress', False):
692 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
693 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
694 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
695 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at byte offset *resume_len*."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Announce a retry (attempt *count* of *retries*) after a server-side HTTP error."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
705 def report_file_already_downloaded(self, file_name):
706 """Report file has already been fully downloaded."""
708 self.to_screen(u'[download] %s has already been downloaded' % file_name)
709 except (UnicodeEncodeError), err:
710 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the partial download was not possible."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
def report_finish(self):
    """Report download finished."""
    # When 'noprogress' is set there was no progress line to terminate,
    # so print an explicit completion message instead.
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')
    # NOTE(review): the else-branch of this method is outside the
    # visible excerpt.
def increment_downloads(self):
    """Advance the ordinal used to number downloaded files (autonumber)."""
    self._num_downloads = self._num_downloads + 1
727 def prepare_filename(self, info_dict):
728 """Generate the output filename."""
730 template_dict = dict(info_dict)
731 template_dict['epoch'] = unicode(long(time.time()))
732 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
733 filename = self.params['outtmpl'] % template_dict
735 except (ValueError, KeyError), err:
736 self.trouble(u'ERROR: invalid system charset or erroneous output template')
def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded """
    title = info_dict['title']
    # 'matchtitle' param: download only titles matching this pattern
    # (case-insensitive regex search).
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
    # 'rejecttitle' param: skip titles matching this pattern.
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    # NOTE(review): the trailing 'return None' is outside the visible
    # excerpt; falling off the end yields None regardless.
751 def process_info(self, info_dict):
752 """Process a single dictionary returned by an InfoExtractor."""
754 reason = self._match_entry(info_dict)
755 if reason is not None:
756 self.to_screen(u'[download] ' + reason)
759 max_downloads = self.params.get('max_downloads')
760 if max_downloads is not None:
761 if self._num_downloads > int(max_downloads):
762 raise MaxDownloadsReached()
764 filename = self.prepare_filename(info_dict)
767 if self.params.get('forcetitle', False):
768 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
769 if self.params.get('forceurl', False):
770 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
771 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
772 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
773 if self.params.get('forcedescription', False) and 'description' in info_dict:
774 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
775 if self.params.get('forcefilename', False) and filename is not None:
776 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
777 if self.params.get('forceformat', False):
778 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
780 # Do nothing else if in simulate mode
781 if self.params.get('simulate', False):
788 dn = os.path.dirname(_encodeFilename(filename))
789 if dn != '' and not os.path.exists(dn): # dn is already encoded
791 except (OSError, IOError), err:
792 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
795 if self.params.get('writedescription', False):
797 descfn = filename + u'.description'
798 self.report_writedescription(descfn)
799 descfile = open(_encodeFilename(descfn), 'wb')
801 descfile.write(info_dict['description'].encode('utf-8'))
804 except (OSError, IOError):
805 self.trouble(u'ERROR: Cannot write description file ' + descfn)
808 if self.params.get('writeinfojson', False):
809 infofn = filename + u'.info.json'
810 self.report_writeinfojson(infofn)
813 except (NameError,AttributeError):
814 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
817 infof = open(_encodeFilename(infofn), 'wb')
819 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
820 json.dump(json_info_dict, infof)
823 except (OSError, IOError):
824 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
827 if not self.params.get('skip_download', False):
828 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
832 success = self._do_download(filename, info_dict)
833 except (OSError, IOError), err:
834 raise UnavailableVideoError
835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
836 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
838 except (ContentTooShortError, ), err:
839 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
844 self.post_process(filename, info_dict)
845 except (PostProcessingError), err:
846 self.trouble(u'ERROR: postprocessing: %s' % str(err))
849 def download(self, url_list):
850 """Download a given list of URLs."""
851 if len(url_list) > 1 and self.fixed_template():
852 raise SameFileError(self.params['outtmpl'])
855 suitable_found = False
857 # Go to next InfoExtractor if not suitable
858 if not ie.suitable(url):
861 # Suitable InfoExtractor found
862 suitable_found = True
864 # Extract information from URL and process it
867 # Suitable InfoExtractor had been found; go to next URL
870 if not suitable_found:
871 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
873 return self._download_retcode
875 def post_process(self, filename, ie_info):
876 """Run the postprocessing chain on the given file."""
878 info['filepath'] = filename
884 def _download_with_rtmpdump(self, filename, url, player_url):
885 self.report_destination(filename)
886 tmpfilename = self.temp_name(filename)
888 # Check for rtmpdump first
890 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
891 except (OSError, IOError):
892 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
895 # Download using rtmpdump. rtmpdump returns exit code 2 when
896 # the connection was interrumpted and resuming appears to be
897 # possible. This is part of rtmpdump's normal usage, AFAIK.
898 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
899 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
900 while retval == 2 or retval == 1:
901 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
902 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
903 time.sleep(5.0) # This seems to be needed
904 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
905 cursize = os.path.getsize(_encodeFilename(tmpfilename))
906 if prevsize == cursize and retval == 1:
908 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
909 if prevsize == cursize and retval == 2 and cursize > 1024:
910 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
914 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
915 self.try_rename(tmpfilename, filename)
918 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
921 def _do_download(self, filename, info_dict):
922 url = info_dict['url']
923 player_url = info_dict.get('player_url', None)
925 # Check file already present
926 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
927 self.report_file_already_downloaded(filename)
930 # Attempt to download using rtmpdump
931 if url.startswith('rtmp'):
932 return self._download_with_rtmpdump(filename, url, player_url)
934 tmpfilename = self.temp_name(filename)
937 # Do not include the Accept-Encoding header
938 headers = {'Youtubedl-no-compression': 'True'}
939 basic_request = urllib2.Request(url, None, headers)
940 request = urllib2.Request(url, None, headers)
942 # Establish possible resume length
943 if os.path.isfile(_encodeFilename(tmpfilename)):
944 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
950 if self.params.get('continuedl', False):
951 self.report_resuming_byte(resume_len)
952 request.add_header('Range','bytes=%d-' % resume_len)
958 retries = self.params.get('retries', 0)
959 while count <= retries:
960 # Establish connection
962 if count == 0 and 'urlhandle' in info_dict:
963 data = info_dict['urlhandle']
964 data = urllib2.urlopen(request)
966 except (urllib2.HTTPError, ), err:
967 if (err.code < 500 or err.code >= 600) and err.code != 416:
968 # Unexpected HTTP error
970 elif err.code == 416:
971 # Unable to resume (requested range not satisfiable)
973 # Open the connection again without the range header
974 data = urllib2.urlopen(basic_request)
975 content_length = data.info()['Content-Length']
976 except (urllib2.HTTPError, ), err:
977 if err.code < 500 or err.code >= 600:
980 # Examine the reported length
981 if (content_length is not None and
982 (resume_len - 100 < long(content_length) < resume_len + 100)):
983 # The file had already been fully downloaded.
984 # Explanation to the above condition: in issue #175 it was revealed that
985 # YouTube sometimes adds or removes a few bytes from the end of the file,
986 # changing the file size slightly and causing problems for some users. So
987 # I decided to implement a suggested change and consider the file
988 # completely downloaded if the file size differs less than 100 bytes from
989 # the one in the hard drive.
990 self.report_file_already_downloaded(filename)
991 self.try_rename(tmpfilename, filename)
994 # The length does not match, we start the download over
995 self.report_unable_to_resume()
1000 if count <= retries:
1001 self.report_retry(count, retries)
1004 self.trouble(u'ERROR: giving up after %s retries' % retries)
1007 data_len = data.info().get('Content-length', None)
1008 if data_len is not None:
1009 data_len = long(data_len) + resume_len
1010 data_len_str = self.format_bytes(data_len)
1011 byte_counter = 0 + resume_len
1015 # Download and write
1016 before = time.time()
1017 data_block = data.read(block_size)
1019 if len(data_block) == 0:
1021 byte_counter += len(data_block)
1023 # Open file just in time
1026 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1027 assert stream is not None
1028 filename = self.undo_temp_name(tmpfilename)
1029 self.report_destination(filename)
1030 except (OSError, IOError), err:
1031 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1034 stream.write(data_block)
1035 except (IOError, OSError), err:
1036 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1038 block_size = self.best_block_size(after - before, len(data_block))
1041 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1042 if data_len is None:
1043 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1045 percent_str = self.calc_percent(byte_counter, data_len)
1046 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1047 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1050 self.slow_down(start, byte_counter - resume_len)
1053 self.trouble(u'\nERROR: Did not get any data blocks')
1056 self.report_finish()
1057 if data_len is not None and byte_counter != data_len:
1058 raise ContentTooShortError(byte_counter, long(data_len))
1059 self.try_rename(tmpfilename, filename)
1061 # Update file modification time
1062 if self.params.get('updatetime', True):
1063 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1068 class InfoExtractor(object):
1069 """Information Extractor class.
1071 Information extractors are the classes that, given a URL, extract
1072 information from the video (or videos) the URL refers to. This
1073 information includes the real video URL, the video title and simplified
1074 title, author and others. The information is stored in a dictionary
1075 which is then passed to the FileDownloader. The FileDownloader
1076 processes this information possibly downloading the video to the file
1077 system, among other possible outcomes. The dictionaries must include
1078 the following fields:
1080 id: Video identifier.
1081 url: Final video URL.
1082 uploader: Nickname of the video uploader.
1083 title: Literal title.
1084 stitle: Simplified title.
1085 ext: Video filename extension.
1086 format: Video format.
1087 player_url: SWF Player URL (may be None).
1089 The following fields are optional. Their primary purpose is to allow
1090 youtube-dl to serve as the backend for a video search function, such
1091 as the one in youtube2mp3. They are only used when their respective
1092 forced printing functions are called:
1094 thumbnail: Full URL to a video thumbnail image.
1095 description: One-line video description.
1097 Subclasses of this one should re-define the _real_initialize() and
1098 _real_extract() methods and define a _VALID_URL regexp.
1099 Probably, they should also be added to the list of extractors.
1105 def __init__(self, downloader=None):
1106 """Constructor. Receives an optional downloader."""
1108 self.set_downloader(downloader)
def suitable(self, url):
    """Return True when *url* can be handled by this InfoExtractor."""
    match = re.match(self._VALID_URL, url)
    return match is not None
1114 def initialize(self):
1115 """Initializes an instance (authentication, etc)."""
1117 self._real_initialize()
1120 def extract(self, url):
1121 """Extracts URL information and returns it in list of dicts."""
1123 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored so extraction code and the report_* helpers can emit
    # messages and trigger downloads through the FileDownloader.
    self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally left as a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally left as a no-op in the base class.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles language selection, optional login (explicit credentials or
    .netrc), age-gate confirmation, and extraction of the real media URL
    for the requested format(s).

    NOTE(review): this chunk appears to have lines elided (several 'try:',
    'if mobj is None:', 'return' and dict-literal opening lines are
    missing); comments below document intent, not the literal control flow.
    """

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (YouTube itag codes, best first)
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map (NOTE(review): most entries elided here)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" resolution map (NOTE(review): entries elided here)
    _video_dimensions = {
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        # Print one "itag : extension [WxH]" line per available format.
        # NOTE(review): the enclosing 'for x in formats:' loop appears to
        # be elided from this chunk.
        print 'Available formats:'
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        # Sets YouTube language to English, then (optionally) logs in and
        # confirms the age gate. Warnings are non-fatal; only a failed age
        # confirmation is reported through trouble().
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Look up stored credentials for the 'youtube' machine entry.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force English so date/metadata scraping regexes match.
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:

        # NOTE(review): the opening of the login_form dict literal appears
        # to be elided from this chunk.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # If the login form is still present, credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm the age gate (required for age-restricted videos).
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (with age-gate bypass parameter).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one yields a
        # 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                # YouTube supplied a human-readable failure reason.
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to
        # YYYYMMDD by trying several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description: only fetched when the caller asked for it.
        video_description = u'No description available.'
        if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1).decode('utf-8')
                html_parser = lxml.etree.HTMLParser(encoding='utf-8')
                vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
                video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
                # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build an itag -> URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit: keep only it and worse.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Bypasses the family filter on startup, delegates yt-prefixed IDs to
    the YouTube extractor, and otherwise scrapes the media URL (with
    optional gdaKey signature) from the watch page or its flashvars.

    NOTE(review): this chunk appears to have lines elided (several 'try:',
    'if mobj is None:', 'return' and dict-literal opening lines are
    missing); comments below document intent, not the literal control flow.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Kept so yt-prefixed Metacafe IDs can be delegated to YouTube.
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: POST the family-filter form.
        # NOTE(review): the opening of the disclaimer_form dict literal
        # appears to be elided from this chunk.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate extraction to the YouTube IE.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL found in the page.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Unescape JSON's \/ and append the signature key.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Disables the family filter via a cookie, then scrapes the SD media URL
    out of the player's 'sequence' flash variable.

    NOTE(review): this chunk appears to have lines elided (several 'try:',
    'if mobj is None:' and 'return' lines are missing); comments below
    document intent, not the literal control flow.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Turn off the family filter so restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Prefers the mp4 download_url; falls back to the flv videoUrl embedded
    (hex-escaped) in the page. Thumbnail lookup only happens when the
    caller forces it.

    NOTE(review): this chunk appears to have lines elided (several 'try:',
    'if mobj is None:' and 'return' lines are missing); comments below
    document intent, not the literal control flow.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: flv URL hex-escaped in the page source.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the embedded JS ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnail is only available through a search-page scrape.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1792 class PhotobucketIE(InfoExtractor):
1793 """Information extractor for photobucket.com."""
1795 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1796 IE_NAME = u'photobucket'
1798 def __init__(self, downloader=None):
1799 InfoExtractor.__init__(self, downloader)
1801 def report_download_webpage(self, video_id):
1802 """Report webpage download."""
1803 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1805 def report_extraction(self, video_id):
1806 """Report information extraction."""
1807 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1809 def _real_extract(self, url):
1810 # Extract id from URL
1811 mobj = re.match(self._VALID_URL, url)
1813 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1816 # At this point we have a new video
1817 self._downloader.increment_downloads()
1818 video_id = mobj.group(1)
1820 video_extension = 'flv'
1822 # Retrieve video webpage to extract further information
1823 request = urllib2.Request(url)
1825 self.report_download_webpage(video_id)
1826 webpage = urllib2.urlopen(request).read()
1827 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1828 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1831 # Extract URL, uploader, and title from webpage
1832 self.report_extraction(video_id)
1833 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1835 self._downloader.trouble(u'ERROR: unable to extract media URL')
1837 mediaURL = urllib.unquote(mobj.group(1))
1839 video_url = mediaURL
1841 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1843 self._downloader.trouble(u'ERROR: unable to extract title')
1845 video_title = mobj.group(1).decode('utf-8')
1846 video_title = sanitize_title(video_title)
1847 simple_title = _simplify_title(vide_title)
1849 video_uploader = mobj.group(2).decode('utf-8')
1852 # Process video information
1853 self._downloader.process_info({
1854 'id': video_id.decode('utf-8'),
1855 'url': video_url.decode('utf-8'),
1856 'uploader': video_uploader,
1857 'upload_date': u'NA',
1858 'title': video_title,
1859 'stitle': simple_title,
1860 'ext': video_extension.decode('utf-8'),
1864 except UnavailableVideoError:
1865 self._downloader.trouble(u'\nERROR: unable to download video')
1868 class YahooIE(InfoExtractor):
1869 """Information extractor for video.yahoo.com."""
1871 # _VALID_URL matches all Yahoo! Video URLs
1872 # _VPAGE_URL matches only the extractable '/watch/' URLs
1873 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1874 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1875 IE_NAME = u'video.yahoo'
1877 def __init__(self, downloader=None):
1878 InfoExtractor.__init__(self, downloader)
1880 def report_download_webpage(self, video_id):
1881 """Report webpage download."""
1882 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1884 def report_extraction(self, video_id):
1885 """Report information extraction."""
1886 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1888 def _real_extract(self, url, new_video=True):
1889 # Extract ID from URL
1890 mobj = re.match(self._VALID_URL, url)
1892 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1895 # At this point we have a new video
1896 self._downloader.increment_downloads()
1897 video_id = mobj.group(2)
1898 video_extension = 'flv'
1900 # Rewrite valid but non-extractable URLs as
1901 # extractable English language /watch/ URLs
1902 if re.match(self._VPAGE_URL, url) is None:
1903 request = urllib2.Request(url)
1905 webpage = urllib2.urlopen(request).read()
1906 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1912 self._downloader.trouble(u'ERROR: Unable to extract id field')
1914 yahoo_id = mobj.group(1)
1916 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1918 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1920 yahoo_vid = mobj.group(1)
1922 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1923 return self._real_extract(url, new_video=False)
1925 # Retrieve video webpage to extract further information
1926 request = urllib2.Request(url)
1928 self.report_download_webpage(video_id)
1929 webpage = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1934 # Extract uploader and title from webpage
1935 self.report_extraction(video_id)
1936 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1938 self._downloader.trouble(u'ERROR: unable to extract video title')
1940 video_title = mobj.group(1).decode('utf-8')
1941 simple_title = _simplify_title(video_title)
1943 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1945 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1947 video_uploader = mobj.group(1).decode('utf-8')
1949 # Extract video thumbnail
1950 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1952 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1954 video_thumbnail = mobj.group(1).decode('utf-8')
1956 # Extract video description
1957 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1959 self._downloader.trouble(u'ERROR: unable to extract video description')
1961 video_description = mobj.group(1).decode('utf-8')
1962 if not video_description:
1963 video_description = 'No description available.'
1965 # Extract video height and width
1966 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1968 self._downloader.trouble(u'ERROR: unable to extract video height')
1970 yv_video_height = mobj.group(1)
1972 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1974 self._downloader.trouble(u'ERROR: unable to extract video width')
1976 yv_video_width = mobj.group(1)
1978 # Retrieve video playlist to extract media URL
1979 # I'm not completely sure what all these options are, but we
1980 # seem to need most of them, otherwise the server sends a 401.
1981 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1982 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1983 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1984 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1985 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1987 self.report_download_webpage(video_id)
1988 webpage = urllib2.urlopen(request).read()
1989 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1990 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1993 # Extract media URL from playlist XML
1994 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1996 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1998 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1999 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2002 # Process video information
2003 self._downloader.process_info({
2004 'id': video_id.decode('utf-8'),
2006 'uploader': video_uploader,
2007 'upload_date': u'NA',
2008 'title': video_title,
2009 'stitle': simple_title,
2010 'ext': video_extension.decode('utf-8'),
2011 'thumbnail': video_thumbnail.decode('utf-8'),
2012 'description': video_description,
2013 'thumbnail': video_thumbnail,
2016 except UnavailableVideoError:
2017 self._downloader.trouble(u'\nERROR: unable to download video')
2020 class VimeoIE(InfoExtractor):
2021 """Information extractor for vimeo.com."""
2023 # _VALID_URL matches Vimeo URLs
2024 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    def __init__(self, downloader=None):
        """Constructor. Delegates to the base class, which stores *downloader*."""
        InfoExtractor.__init__(self, downloader)
2030 def report_download_webpage(self, video_id):
2031 """Report webpage download."""
2032 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2034 def report_extraction(self, video_id):
2035 """Report information extraction."""
2036 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2038 def _real_extract(self, url, new_video=True):
2039 # Extract ID from URL
2040 mobj = re.match(self._VALID_URL, url)
2042 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2045 # At this point we have a new video
2046 self._downloader.increment_downloads()
2047 video_id = mobj.group(1)
2049 # Retrieve video webpage to extract further information
2050 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2052 self.report_download_webpage(video_id)
2053 webpage = urllib2.urlopen(request).read()
2054 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2055 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2058 # Now we begin extracting as much information as we can from what we
2059 # retrieved. First we extract the information common to all extractors,
2060 # and latter we extract those that are Vimeo specific.
2061 self.report_extraction(video_id)
2064 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2066 self._downloader.trouble(u'ERROR: unable to extract video title')
2068 video_title = mobj.group(1).decode('utf-8')
2069 simple_title = _simplify_title(video_title)
2072 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2074 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2076 video_uploader = mobj.group(1).decode('utf-8')
2078 # Extract video thumbnail
2079 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2081 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2083 video_thumbnail = mobj.group(1).decode('utf-8')
2085 # # Extract video description
2086 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2088 # self._downloader.trouble(u'ERROR: unable to extract video description')
2090 # video_description = mobj.group(1).decode('utf-8')
2091 # if not video_description: video_description = 'No description available.'
2092 video_description = 'Foo.'
2094 # Vimeo specific: extract request signature
2095 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2097 self._downloader.trouble(u'ERROR: unable to extract request signature')
2099 sig = mobj.group(1).decode('utf-8')
2101 # Vimeo specific: extract video quality information
2102 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2104 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2106 quality = mobj.group(1).decode('utf-8')
2108 if int(quality) == 1:
2113 # Vimeo specific: Extract request signature expiration
2114 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2116 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2118 sig_exp = mobj.group(1).decode('utf-8')
2120 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2123 # Process video information
2124 self._downloader.process_info({
2125 'id': video_id.decode('utf-8'),
2127 'uploader': video_uploader,
2128 'upload_date': u'NA',
2129 'title': video_title,
2130 'stitle': simple_title,
2132 'thumbnail': video_thumbnail.decode('utf-8'),
2133 'description': video_description,
2134 'thumbnail': video_thumbnail,
2135 'description': video_description,
2138 except UnavailableVideoError:
2139 self._downloader.trouble(u'ERROR: unable to download video')
2142 class GenericIE(InfoExtractor):
2143 """Generic last-resort information extractor."""
2146 IE_NAME = u'generic'
2148 def __init__(self, downloader=None):
2149 InfoExtractor.__init__(self, downloader)
2151 def report_download_webpage(self, video_id):
2152 """Report webpage download."""
2153 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2154 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2156 def report_extraction(self, video_id):
2157 """Report information extraction."""
2158 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2160 def _real_extract(self, url):
2161 # At this point we have a new video
2162 self._downloader.increment_downloads()
2164 video_id = url.split('/')[-1]
2165 request = urllib2.Request(url)
2167 self.report_download_webpage(video_id)
2168 webpage = urllib2.urlopen(request).read()
2169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2170 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2172 except ValueError, err:
2173 # since this is the last-resort InfoExtractor, if
2174 # this error is thrown, it'll be thrown here
2175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2178 self.report_extraction(video_id)
2179 # Start with something easy: JW Player in SWFObject
2180 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2182 # Broaden the search a little bit
2183 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2185 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2188 # It's possible that one of the regexes
2189 # matched, but returned an empty group:
2190 if mobj.group(1) is None:
2191 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2194 video_url = urllib.unquote(mobj.group(1))
2195 video_id = os.path.basename(video_url)
2197 # here's a fun little line of code for you:
2198 video_extension = os.path.splitext(video_id)[1][1:]
2199 video_id = os.path.splitext(video_id)[0]
2201 # it's tempting to parse this further, but you would
2202 # have to take into account all the variations like
2203 # Video Title - Site Name
2204 # Site Name | Video Title
2205 # Video Title - Tagline | Site Name
2206 # and so on and so forth; it's just not practical
2207 mobj = re.search(r'<title>(.*)</title>', webpage)
2209 self._downloader.trouble(u'ERROR: unable to extract title')
2211 video_title = mobj.group(1).decode('utf-8')
2212 video_title = sanitize_title(video_title)
2213 simple_title = _simplify_title(video_title)
2215 # video uploader is domain name
2216 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2218 self._downloader.trouble(u'ERROR: unable to extract title')
2220 video_uploader = mobj.group(1).decode('utf-8')
2223 # Process video information
2224 self._downloader.process_info({
2225 'id': video_id.decode('utf-8'),
2226 'url': video_url.decode('utf-8'),
2227 'uploader': video_uploader,
2228 'upload_date': u'NA',
2229 'title': video_title,
2230 'stitle': simple_title,
2231 'ext': video_extension.decode('utf-8'),
2235 except UnavailableVideoError, err:
2236 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this numbered listing drops some structural lines (guards,
# try:/return, loop headers); code is kept byte-identical, comments only.
2239 class YoutubeSearchIE(InfoExtractor):
2240 """Information Extractor for YouTube search queries."""
# Query scheme: ytsearch:<terms>, ytsearchN:<terms>, ytsearchall:<terms>.
2241 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2242 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
# Matches watch links in the HTML results page; the id is cut out of the
# match text below rather than captured by a group.
2243 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2244 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2246 _max_youtube_results = 1000
2247 IE_NAME = u'youtube:search'
# Delegates actual extraction to a YoutubeIE instance.
2249 def __init__(self, youtube_ie, downloader=None):
2250 InfoExtractor.__init__(self, downloader)
2251 self._youtube_ie = youtube_ie
2253 def report_download_page(self, query, pagenum):
2254 """Report attempt to download playlist page with given number."""
2255 query = query.decode(preferredencoding())
2256 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2258 def _real_initialize(self):
2259 self._youtube_ie.initialize()
# Parses the prefix to decide how many results to fetch: empty -> 1,
# 'all' -> the hard cap, a number -> that many (clamped to the cap).
2261 def _real_extract(self, query):
2262 mobj = re.match(self._VALID_URL, query)
2264 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2267 prefix, query = query.split(':')
2269 query = query.encode('utf-8')
2271 self._download_n_results(query, 1)
2273 elif prefix == 'all':
2274 self._download_n_results(query, self._max_youtube_results)
2280 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2282 elif n > self._max_youtube_results:
2283 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2284 n = self._max_youtube_results
2285 self._download_n_results(query, n)
2287 except ValueError: # parsing prefix as integer fails
2288 self._download_n_results(query, 1)
2291 def _download_n_results(self, query, n):
2292 """Downloads a specified number of results for a query"""
2295 already_seen = set()
2299 self.report_download_page(query, pagenum)
2300 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2301 request = urllib2.Request(result_url)
2303 page = urllib2.urlopen(request).read()
2304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2308 # Extract video identifiers
# The match text is href="/watch?v=ID": split on '=' takes the id plus the
# closing quote ([2]), and [:-1] drops the quote.
2309 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2310 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2311 if video_id not in already_seen:
2312 video_ids.append(video_id)
2313 already_seen.add(video_id)
# Once n distinct ids are collected, hand each to the YouTube extractor.
2314 if len(video_ids) == n:
2315 # Specified n videos reached
2316 for id in video_ids:
2317 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last results page — extract whatever was collected.
2320 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2321 for id in video_ids:
2322 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2325 pagenum = pagenum + 1
# NOTE(review): numbered listing with elided structural lines; code kept
# byte-identical, comments only. Mirrors YoutubeSearchIE's structure.
2328 class GoogleSearchIE(InfoExtractor):
2329 """Information Extractor for Google Video search queries."""
# Query scheme: gvsearch[:N|:all]:<terms>.
2330 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2331 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Unlike YoutubeSearchIE, the id is captured directly by group(1).
2332 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2333 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2335 _max_google_results = 1000
2336 IE_NAME = u'video.google:search'
# Delegates actual extraction to a GoogleIE instance.
2338 def __init__(self, google_ie, downloader=None):
2339 InfoExtractor.__init__(self, downloader)
2340 self._google_ie = google_ie
2342 def report_download_page(self, query, pagenum):
2343 """Report attempt to download playlist page with given number."""
2344 query = query.decode(preferredencoding())
2345 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2347 def _real_initialize(self):
2348 self._google_ie.initialize()
# Prefix parsing: empty -> 1 result, 'all' -> cap, number -> clamped n.
2350 def _real_extract(self, query):
2351 mobj = re.match(self._VALID_URL, query)
2353 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2356 prefix, query = query.split(':')
2358 query = query.encode('utf-8')
2360 self._download_n_results(query, 1)
2362 elif prefix == 'all':
2363 self._download_n_results(query, self._max_google_results)
2369 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2371 elif n > self._max_google_results:
2372 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2373 n = self._max_google_results
2374 self._download_n_results(query, n)
2376 except ValueError: # parsing prefix as integer fails
2377 self._download_n_results(query, 1)
2380 def _download_n_results(self, query, n):
2381 """Downloads a specified number of results for a query"""
2384 already_seen = set()
2388 self.report_download_page(query, pagenum)
2389 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2390 request = urllib2.Request(result_url)
2392 page = urllib2.urlopen(request).read()
2393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2397 # Extract video identifiers
2398 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2399 video_id = mobj.group(1)
2400 if video_id not in already_seen:
2401 video_ids.append(video_id)
2402 already_seen.add(video_id)
# Stop as soon as n distinct docids are collected.
2403 if len(video_ids) == n:
2404 # Specified n videos reached
2405 for id in video_ids:
2406 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" span: last page — extract what was collected.
2409 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2410 for id in video_ids:
2411 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2414 pagenum = pagenum + 1
# NOTE(review): numbered listing with elided structural lines; code kept
# byte-identical, comments only. Third copy of the search-IE pattern.
2417 class YahooSearchIE(InfoExtractor):
2418 """Information Extractor for Yahoo! Video search queries."""
# Query scheme: yvsearch[:N|:all]:<terms>.
2419 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2420 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Captures the two-part Yahoo id ("<num>/<num>") in group(1).
2421 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2422 _MORE_PAGES_INDICATOR = r'\s*Next'
2424 _max_yahoo_results = 1000
2425 IE_NAME = u'video.yahoo:search'
# Delegates actual extraction to a YahooIE instance.
2427 def __init__(self, yahoo_ie, downloader=None):
2428 InfoExtractor.__init__(self, downloader)
2429 self._yahoo_ie = yahoo_ie
2431 def report_download_page(self, query, pagenum):
2432 """Report attempt to download playlist page with given number."""
2433 query = query.decode(preferredencoding())
2434 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2436 def _real_initialize(self):
2437 self._yahoo_ie.initialize()
# Prefix parsing: empty -> 1 result, 'all' -> cap, number -> clamped n.
2439 def _real_extract(self, query):
2440 mobj = re.match(self._VALID_URL, query)
2442 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2445 prefix, query = query.split(':')
2447 query = query.encode('utf-8')
2449 self._download_n_results(query, 1)
2451 elif prefix == 'all':
2452 self._download_n_results(query, self._max_yahoo_results)
2458 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2460 elif n > self._max_yahoo_results:
2461 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2462 n = self._max_yahoo_results
2463 self._download_n_results(query, n)
2465 except ValueError: # parsing prefix as integer fails
2466 self._download_n_results(query, 1)
2469 def _download_n_results(self, query, n):
2470 """Downloads a specified number of results for a query"""
2473 already_seen = set()
2477 self.report_download_page(query, pagenum)
2478 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2479 request = urllib2.Request(result_url)
2481 page = urllib2.urlopen(request).read()
2482 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2483 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2486 # Extract video identifiers
2487 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2488 video_id = mobj.group(1)
2489 if video_id not in already_seen:
2490 video_ids.append(video_id)
2491 already_seen.add(video_id)
# Stop as soon as n distinct ids are collected.
2492 if len(video_ids) == n:
2493 # Specified n videos reached
2494 for id in video_ids:
2495 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: last page — extract what was collected.
2498 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2499 for id in video_ids:
2500 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2503 pagenum = pagenum + 1
2506 class YoutubePlaylistIE(InfoExtractor):
2507 """Information Extractor for YouTube playlists."""
2509 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2510 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2511 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2512 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2514 IE_NAME = u'youtube:playlist'
2516 def __init__(self, youtube_ie, downloader=None):
2517 InfoExtractor.__init__(self, downloader)
2518 self._youtube_ie = youtube_ie
2520 def report_download_page(self, playlist_id, pagenum):
2521 """Report attempt to download playlist page with given number."""
2522 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2524 def _real_initialize(self):
2525 self._youtube_ie.initialize()
2527 def _real_extract(self, url):
2528 # Extract playlist id
2529 mobj = re.match(self._VALID_URL, url)
2531 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2535 if mobj.group(3) is not None:
2536 self._youtube_ie.extract(mobj.group(3))
2539 # Download playlist pages
2540 # prefix is 'p' as default for playlists but there are other types that need extra care
2541 playlist_prefix = mobj.group(1)
2542 if playlist_prefix == 'a':
2543 playlist_access = 'artist'
2545 playlist_prefix = 'p'
2546 playlist_access = 'view_play_list'
2547 playlist_id = mobj.group(2)
2552 self.report_download_page(playlist_id, pagenum)
2553 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2554 request = urllib2.Request(url)
2556 page = urllib2.urlopen(request).read()
2557 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2558 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2561 # Extract video identifiers
2563 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2564 if mobj.group(1) not in ids_in_page:
2565 ids_in_page.append(mobj.group(1))
2566 video_ids.extend(ids_in_page)
2568 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2570 pagenum = pagenum + 1
2572 playliststart = self._downloader.params.get('playliststart', 1) - 1
2573 playlistend = self._downloader.params.get('playlistend', -1)
2574 video_ids = video_ids[playliststart:playlistend]
2576 for id in video_ids:
2577 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): numbered listing with elided structural lines (loop header,
# try:, return); code kept byte-identical, comments only.
2581 class YoutubeUserIE(InfoExtractor):
2582 """Information Extractor for YouTube users."""
# Accepts either a youtube.com/user/<name> URL or the ytuser:<name> scheme.
2584 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2585 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData API caps results per request, so uploads are fetched page by page.
2586 _GDATA_PAGE_SIZE = 50
2587 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2588 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2590 IE_NAME = u'youtube:user'
# Delegates actual extraction to a YoutubeIE instance.
2592 def __init__(self, youtube_ie, downloader=None):
2593 InfoExtractor.__init__(self, downloader)
2594 self._youtube_ie = youtube_ie
2596 def report_download_page(self, username, start_index):
2597 """Report attempt to download user page."""
2598 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2599 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2601 def _real_initialize(self):
2602 self._youtube_ie.initialize()
2604 def _real_extract(self, url):
2606 mobj = re.match(self._VALID_URL, url)
2608 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2611 username = mobj.group(1)
2613 # Download video ids using YouTube Data API. Result size per
2614 # query is limited (currently to 50 videos) so we need to query
2615 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2622 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2623 self.report_download_page(username, start_index)
2625 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2628 page = urllib2.urlopen(request).read()
2629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2630 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2633 # Extract video identifiers
2636 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2637 if mobj.group(1) not in ids_in_page:
2638 ids_in_page.append(mobj.group(1))
2640 video_ids.extend(ids_in_page)
2642 # A little optimization - if current page is not
2643 # "full", ie. does not contain PAGE_SIZE video ids then
2644 # we can assume that this page is the last one - there
2645 # are no more ids on further pages - no need to query
2648 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the --playlist-start/--playlist-end window; -1 means "to the end"
# and is handled explicitly so the last video is not dropped.
2653 all_ids_count = len(video_ids)
2654 playliststart = self._downloader.params.get('playliststart', 1) - 1
2655 playlistend = self._downloader.params.get('playlistend', -1)
2657 if playlistend == -1:
2658 video_ids = video_ids[playliststart:]
2660 video_ids = video_ids[playliststart:playlistend]
2662 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2663 (username, all_ids_count, len(video_ids)))
2665 for video_id in video_ids:
2666 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2669 class DepositFilesIE(InfoExtractor):
2670 """Information extractor for depositfiles.com"""
2672 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2673 IE_NAME = u'DepositFiles'
2675 def __init__(self, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2678 def report_download_webpage(self, file_id):
2679 """Report webpage download."""
2680 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2682 def report_extraction(self, file_id):
2683 """Report information extraction."""
2684 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2686 def _real_extract(self, url):
2687 # At this point we have a new file
2688 self._downloader.increment_downloads()
2690 file_id = url.split('/')[-1]
2691 # Rebuild url in english locale
2692 url = 'http://depositfiles.com/en/files/' + file_id
2694 # Retrieve file webpage with 'Free download' button pressed
2695 free_download_indication = { 'gateway_result' : '1' }
2696 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2698 self.report_download_webpage(file_id)
2699 webpage = urllib2.urlopen(request).read()
2700 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2701 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2704 # Search for the real file URL
2705 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2706 if (mobj is None) or (mobj.group(1) is None):
2707 # Try to figure out reason of the error.
2708 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2709 if (mobj is not None) and (mobj.group(1) is not None):
2710 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2711 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2713 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2716 file_url = mobj.group(1)
2717 file_extension = os.path.splitext(file_url)[1][1:]
2719 # Search for file title
2720 mobj = re.search(r'<b title="(.*?)">', webpage)
2722 self._downloader.trouble(u'ERROR: unable to extract title')
2724 file_title = mobj.group(1).decode('utf-8')
2727 # Process file information
2728 self._downloader.process_info({
2729 'id': file_id.decode('utf-8'),
2730 'url': file_url.decode('utf-8'),
2732 'upload_date': u'NA',
2733 'title': file_title,
2734 'stitle': file_title,
2735 'ext': file_extension.decode('utf-8'),
2739 except UnavailableVideoError, err:
2740 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): numbered listing with elided structural lines (login form
# body, try:/return, else branches); code kept byte-identical, comments only.
2743 class FacebookIE(InfoExtractor):
2744 """Information Extractor for Facebook"""
# Matches video/photo pages with a v=<id> query parameter; id in group 'ID'.
2746 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2747 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2748 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2749 _available_formats = ['video', 'highqual', 'lowqual']
2750 _video_extensions = {
2755 IE_NAME = u'facebook'
2757 def __init__(self, downloader=None):
2758 InfoExtractor.__init__(self, downloader)
2760 def _reporter(self, message):
2761 """Add header and report message."""
2762 self._downloader.to_screen(u'[facebook] %s' % message)
2764 def report_login(self):
2765 """Report attempt to log in."""
2766 self._reporter(u'Logging in')
2768 def report_video_webpage_download(self, video_id):
2769 """Report attempt to download video webpage."""
2770 self._reporter(u'%s: Downloading video webpage' % video_id)
2772 def report_information_extraction(self, video_id):
2773 """Report attempt to extract video information."""
2774 self._reporter(u'%s: Extracting video information' % video_id)
# Scrapes fields out of inline Javascript via the regex table below and
# returns a dict; missing fields are simply absent from the result.
2776 def _parse_page(self, video_webpage):
2777 """Extract video information from page"""
2779 data = {'title': r'\("video_title", "(.*?)"\)',
2780 'description': r'<div class="datawrap">(.*?)</div>',
2781 'owner': r'\("video_owner_name", "(.*?)"\)',
2782 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2785 for piece in data.keys():
2786 mobj = re.search(data[piece], video_webpage)
2787 if mobj is not None:
# Values are URL-quoted inside an escaped-Unicode JS segment.
2788 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format name, best-first.
2792 for fmt in self._available_formats:
2793 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2794 if mobj is not None:
2795 # URL is in a Javascript segment inside an escaped Unicode format within
2796 # the generally utf-8 page
2797 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2798 video_info['video_urls'] = video_urls
# Login: credentials come from --username/--password or from .netrc.
2802 def _real_initialize(self):
2803 if self._downloader is None:
2808 downloader_params = self._downloader.params
2810 # Attempt to use provided username and password or .netrc data
2811 if downloader_params.get('username', None) is not None:
2812 useremail = downloader_params['username']
2813 password = downloader_params['password']
2814 elif downloader_params.get('usenetrc', False):
2816 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2817 if info is not None:
2821 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2822 except (IOError, netrc.NetrcParseError), err:
2823 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2826 if useremail is None:
# Login failure is a warning, not an error: public videos still work.
2835 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2838 login_results = urllib2.urlopen(request).read()
# If the response still contains a login form, authentication failed.
2839 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2840 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2843 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2846 def _real_extract(self, url):
2847 mobj = re.match(self._VALID_URL, url)
2849 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2851 video_id = mobj.group('ID')
2854 self.report_video_webpage_download(video_id)
2855 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2857 page = urllib2.urlopen(request)
2858 video_webpage = page.read()
2859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2860 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2863 # Start extracting information
2864 self.report_information_extraction(video_id)
2866 # Extract information
2867 video_info = self._parse_page(video_webpage)
2870 if 'owner' not in video_info:
2871 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2873 video_uploader = video_info['owner']
2876 if 'title' not in video_info:
2877 self._downloader.trouble(u'ERROR: unable to extract video title')
2879 video_title = video_info['title']
2880 video_title = video_title.decode('utf-8')
2881 video_title = sanitize_title(video_title)
2883 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2886 if 'thumbnail' not in video_info:
2887 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2888 video_thumbnail = ''
2890 video_thumbnail = video_info['thumbnail']
# Upload date: RFC 2822 string parsed to YYYYMMDD when available.
2894 if 'upload_date' in video_info:
2895 upload_time = video_info['upload_date']
2896 timetuple = email.utils.parsedate_tz(upload_time)
2897 if timetuple is not None:
2899 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2904 video_description = video_info.get('description', 'No description available.')
2906 url_map = video_info['video_urls']
2907 if len(url_map.keys()) > 0:
2908 # Decide which formats to download
2909 req_format = self._downloader.params.get('format', None)
2910 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality by slicing the best-first format list.
2912 if format_limit is not None and format_limit in self._available_formats:
2913 format_list = self._available_formats[self._available_formats.index(format_limit):]
2915 format_list = self._available_formats
2916 existing_formats = [x for x in format_list if x in url_map]
2917 if len(existing_formats) == 0:
2918 self._downloader.trouble(u'ERROR: no known formats available for video')
# Default: best available; 'worst': last entry; '-1': every format.
2920 if req_format is None:
2921 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2922 elif req_format == 'worst':
2923 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2924 elif req_format == '-1':
2925 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2928 if req_format not in url_map:
2929 self._downloader.trouble(u'ERROR: requested format not available')
2931 video_url_list = [(req_format, url_map[req_format])] # Specific format
2933 for format_param, video_real_url in video_url_list:
2935 # At this point we have a new video
2936 self._downloader.increment_downloads()
2939 video_extension = self._video_extensions.get(format_param, 'mp4')
2942 # Process video information
2943 self._downloader.process_info({
2944 'id': video_id.decode('utf-8'),
2945 'url': video_real_url.decode('utf-8'),
2946 'uploader': video_uploader.decode('utf-8'),
2947 'upload_date': upload_date,
2948 'title': video_title,
2949 'stitle': simple_title,
2950 'ext': video_extension.decode('utf-8'),
2951 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2952 'thumbnail': video_thumbnail.decode('utf-8'),
2953 'description': video_description.decode('utf-8'),
2956 except UnavailableVideoError, err:
2957 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing with elided structural lines; code kept
# byte-identical, comments only.
2959 class BlipTVIE(InfoExtractor):
2960 """Information extractor for blip.tv"""
2962 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2963 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2964 IE_NAME = u'blip.tv'
2966 def report_extraction(self, file_id):
2967 """Report information extraction."""
2968 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2970 def report_direct_download(self, title):
2971 """Report information extraction."""
2972 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2974 def _real_extract(self, url):
2975 mobj = re.match(self._VALID_URL, url)
2977 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Asks blip.tv for a JSON description of the page instead of HTML.
# (cchar is the query separator, chosen in lines elided from this listing —
# presumably '&' when the URL already has a query string, '?' otherwise.)
2984 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2985 request = urllib2.Request(json_url)
2986 self.report_extraction(mobj.group(1))
2989 urlh = urllib2.urlopen(request)
# If the server answers with a video/* body, the URL is already the media
# file: build a minimal info dict from the URL itself, no JSON parsing.
2990 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2991 basename = url.split('/')[-1]
2992 title,ext = os.path.splitext(basename)
2993 title = title.decode('UTF-8')
2994 ext = ext.replace('.', '')
2995 self.report_direct_download(title)
3000 'stitle': _simplify_title(title),
3004 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3005 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3007 if info is None: # Regular URL
3009 json_code = urlh.read()
3010 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3011 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3015 json_data = json.loads(json_code)
# The payload nests the record under 'Post' for post pages.
3016 if 'Post' in json_data:
3017 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with AM/PM %p — %I would be the
# matching 12-hour directive; verify against actual blip.tv datestamps.
3021 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3022 video_url = data['media']['url']
3023 umobj = re.match(self._URL_EXT, video_url)
3025 raise ValueError('Can not determine filename extension')
3026 ext = umobj.group(1)
3029 'id': data['item_id'],
3031 'uploader': data['display_name'],
3032 'upload_date': upload_date,
3033 'title': data['title'],
3034 'stitle': _simplify_title(data['title']),
3036 'format': data['media']['mimeType'],
3037 'thumbnail': data['thumbnailUrl'],
3038 'description': data['description'],
3039 'player_url': data['embedUrl']
# Both JSON shape problems and missing keys are reported, not raised.
3041 except (ValueError,KeyError), err:
3042 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3045 self._downloader.increment_downloads()
3048 self._downloader.process_info(info)
3049 except UnavailableVideoError, err:
3050 self._downloader.trouble(u'\nERROR: unable to download video')
3053 class MyVideoIE(InfoExtractor):
3054 """Information Extractor for myvideo.de."""
3056 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3057 IE_NAME = u'myvideo'
3059 def __init__(self, downloader=None):
3060 InfoExtractor.__init__(self, downloader)
3062 def report_download_webpage(self, video_id):
3063 """Report webpage download."""
3064 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3066 def report_extraction(self, video_id):
3067 """Report information extraction."""
3068 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3070 def _real_extract(self,url):
3071 mobj = re.match(self._VALID_URL, url)
3073 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3076 video_id = mobj.group(1)
3079 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3081 self.report_download_webpage(video_id)
3082 webpage = urllib2.urlopen(request).read()
3083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3084 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3087 self.report_extraction(video_id)
3088 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3091 self._downloader.trouble(u'ERROR: unable to extract media URL')
3093 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3095 mobj = re.search('<title>([^<]+)</title>', webpage)
3097 self._downloader.trouble(u'ERROR: unable to extract title')
3100 video_title = mobj.group(1)
3101 video_title = sanitize_title(video_title)
3103 simple_title = _simplify_title(video_title)
3106 self._downloader.process_info({
3110 'upload_date': u'NA',
3111 'title': video_title,
3112 'stitle': simple_title,
3117 except UnavailableVideoError:
3118 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extractor for Comedy Central full-episode pages (The Daily Show / Colbert
# Report). Resolves 'tds'/'cr'-style shortcuts to the show front page,
# follows the redirect to a concrete episode, locates the Flash player URI,
# downloads the MRSS show index, then per item fetches the mediaGen config
# and picks a rendition.
# NOTE(review): this listing is truncated — guard clauses, try: lines and
# return statements are missing between the numbered lines below.
3120 class ComedyCentralIE(InfoExtractor):
3121 """Information extractor for The Daily Show and Colbert Report """
3123 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3124 IE_NAME = u'comedycentral'
3126 def report_extraction(self, episode_id):
3127 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3129 def report_config_download(self, episode_id):
3130 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3132 def report_index_download(self, episode_id):
3133 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3135 def report_player_url(self, episode_id):
3136 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3138 def _real_extract(self, url):
3139 mobj = re.match(self._VALID_URL, url)
3141 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname forms (":tds", ":colbert", ...) are rewritten to the show's
# full-episodes page and re-matched against _VALID_URL.
3144 if mobj.group('shortname'):
3145 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3146 url = u'http://www.thedailyshow.com/full-episodes/'
3148 url = u'http://www.colbertnation.com/full-episodes/'
3149 mobj = re.match(self._VALID_URL, url)
3150 assert mobj is not None
# No explicit episode in the URL means "download the newest episode".
3152 dlNewest = not mobj.group('episode')
3154 epTitle = mobj.group('showname')
3156 epTitle = mobj.group('episode')
3158 req = urllib2.Request(url)
3159 self.report_extraction(epTitle)
3161 htmlHandle = urllib2.urlopen(req)
3162 html = htmlHandle.read()
3163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3164 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The front page redirects to the latest episode; geturl() exposes the
# final (episode-specific) URL after redirects.
3167 url = htmlHandle.geturl()
3168 mobj = re.match(self._VALID_URL, url)
3170 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3172 if mobj.group('episode') == '':
3173 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3175 epTitle = mobj.group('episode')
# Flash URL appears either as a <param name="movie"> or a JS "var url =".
3177 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3178 if len(mMovieParams) == 0:
3179 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3182 playerUrl_raw = mMovieParams[0][0]
3183 self.report_player_url(epTitle)
# Resolve the player URL through its redirects as well.
3185 urlHandle = urllib2.urlopen(playerUrl_raw)
3186 playerUrl = urlHandle.geturl()
3187 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3188 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3191 uri = mMovieParams[0][1]
3192 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3193 self.report_index_download(epTitle)
3195 indexXml = urllib2.urlopen(indexUrl).read()
3196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3197 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# MRSS index: one <item> per video act of the episode.
3200 idoc = xml.etree.ElementTree.fromstring(indexXml)
3201 itemEls = idoc.findall('.//item')
3202 for itemEl in itemEls:
3203 mediaId = itemEl.findall('./guid')[0].text
3204 shortMediaId = mediaId.split(':')[-1]
3205 showId = mediaId.split(':')[-2].replace('.com', '')
3206 officialTitle = itemEl.findall('./title')[0].text
3207 officialDate = itemEl.findall('./pubDate')[0].text
3209 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3210 urllib.urlencode({'uri': mediaId}))
3211 configReq = urllib2.Request(configUrl)
3212 self.report_config_download(epTitle)
3214 configXml = urllib2.urlopen(configReq).read()
3215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3219 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) pairs; presumably accumulated into `turls`
# (assignment line missing from this listing — TODO confirm).
3221 for rendition in cdoc.findall('.//rendition'):
3222 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3226 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3229 # For now, just pick the highest bitrate
3230 format,video_url = turls[-1]
3232 self._downloader.increment_downloads()
3234 effTitle = showId + u'-' + epTitle
3239 'upload_date': officialDate,
3241 'stitle': _simplify_title(effTitle),
3245 'description': officialTitle,
3246 'player_url': playerUrl
3250 self._downloader.process_info(info)
3251 except UnavailableVideoError, err:
3252 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com videos: scrapes OpenGraph meta tags
# for description/thumbnail/player URL, pulls the player's config= query
# parameter, fetches that config (JS object, coerced to JSON) and takes
# the media URL from its playlist.
# NOTE(review): listing is truncated — 'if ... is None' guards, try:
# lines and return statements are missing between the numbered lines.
3256 class EscapistIE(InfoExtractor):
3257 """Information extractor for The Escapist """
3259 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3260 IE_NAME = u'escapist'
3262 def report_extraction(self, showName):
3263 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3265 def report_config_download(self, showName):
3266 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3268 def _real_extract(self, url):
3269 htmlParser = HTMLParser.HTMLParser()
3271 mobj = re.match(self._VALID_URL, url)
3273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3275 showName = mobj.group('showname')
3276 videoId = mobj.group('episode')
3278 self.report_extraction(showName)
3280 webPage = urllib2.urlopen(url).read()
3281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3282 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# OpenGraph / meta tags carry description, thumbnail and player URL.
# NOTE(review): these .group(1) calls assume the regexes matched; no
# None-checks are visible in this listing.
3285 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3286 description = htmlParser.unescape(descMatch.group(1))
3287 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3288 imgUrl = htmlParser.unescape(imgMatch.group(1))
3289 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3290 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3291 configUrlMatch = re.search('config=(.*)$', playerUrl)
3292 configUrl = urllib2.unquote(configUrlMatch.group(1))
3294 self.report_config_download(showName)
3296 configJSON = urllib2.urlopen(configUrl).read()
3297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3298 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3301 # Technically, it's JavaScript, not JSON
3302 configJSON = configJSON.replace("'", '"')
3305 config = json.loads(configJSON)
3306 except (ValueError,), err:
3307 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] (not [0]) holds the actual video entry.
3310 playlist = config['playlist']
3311 videoUrl = playlist[1]['url']
3313 self._downloader.increment_downloads()
3317 'uploader': showName,
3318 'upload_date': None,
3320 'stitle': _simplify_title(showName),
3323 'thumbnail': imgUrl,
3324 'description': description,
3325 'player_url': playerUrl,
3329 self._downloader.process_info(info)
3330 except UnavailableVideoError, err:
3331 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Extractor for collegehumor.com: reads the internal video id from the
# watch page, then fetches the moogaloop XML manifest for title, file URL,
# thumbnail and description.
# NOTE(review): listing is truncated — guards, try: lines and returns
# are missing between the numbered lines.
3334 class CollegeHumorIE(InfoExtractor):
3335 """Information extractor for collegehumor.com"""
3337 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3338 IE_NAME = u'collegehumor'
3340 def report_webpage(self, video_id):
3341 """Report information extraction."""
3342 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3344 def report_extraction(self, video_id):
3345 """Report information extraction."""
3346 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3348 def _real_extract(self, url):
3349 htmlParser = HTMLParser.HTMLParser()
3351 mobj = re.match(self._VALID_URL, url)
3353 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3355 video_id = mobj.group('videoid')
3357 self.report_webpage(video_id)
3358 request = urllib2.Request(url)
3360 webpage = urllib2.urlopen(request).read()
3361 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3362 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id (distinct from the URL id) used by
# the moogaloop XML endpoint below.
3365 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3367 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3369 internal_video_id = m.group('internalvideoid')
3373 'internal_id': internal_video_id,
3376 self.report_extraction(video_id)
3377 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3379 metaXml = urllib2.urlopen(xmlUrl).read()
3380 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3381 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Parse the manifest; the findall(...)[0] pattern raises IndexError on
# malformed XML, reported as the error below.
3384 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3386 videoNode = mdoc.findall('./video')[0]
3387 info['description'] = videoNode.findall('./description')[0].text
3388 info['title'] = videoNode.findall('./caption')[0].text
3389 info['stitle'] = _simplify_title(info['title'])
3390 info['url'] = videoNode.findall('./file')[0].text
3391 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3392 info['ext'] = info['url'].rpartition('.')[2]
3393 info['format'] = info['ext']
3395 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3398 self._downloader.increment_downloads()
3401 self._downloader.process_info(info)
3402 except UnavailableVideoError, err:
3403 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for xvideos.com: scrapes the watch page for the flv_url
# query parameter (URL-encoded media URL), the <title> tag and the
# thumbnail URL.
# NOTE(review): listing is truncated — 'if mobj is None' guards, try:
# lines and returns are missing between the numbered lines.
3406 class XVideosIE(InfoExtractor):
3407 """Information extractor for xvideos.com"""
3409 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3410 IE_NAME = u'xvideos'
3412 def report_webpage(self, video_id):
3413 """Report information extraction."""
3414 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3416 def report_extraction(self, video_id):
3417 """Report information extraction."""
3418 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3420 def _real_extract(self, url):
3421 htmlParser = HTMLParser.HTMLParser()
3423 mobj = re.match(self._VALID_URL, url)
3425 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3427 video_id = mobj.group(1).decode('utf-8')
3429 self.report_webpage(video_id)
3431 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3433 webpage = urllib2.urlopen(request).read()
3434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3435 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3438 self.report_extraction(video_id)
# Media URL is percent-encoded in the flv_url parameter.
3442 mobj = re.search(r'flv_url=(.+?)&', webpage)
3444 self._downloader.trouble(u'ERROR: unable to extract video url')
3446 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3450 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3452 self._downloader.trouble(u'ERROR: unable to extract video title')
3454 video_title = mobj.group(1).decode('utf-8')
3457 # Extract video thumbnail
3458 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3460 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3462 video_thumbnail = mobj.group(1).decode('utf-8')
3466 self._downloader.increment_downloads()
3471 'upload_date': None,
3472 'title': video_title,
3473 'stitle': _simplify_title(video_title),
3476 'thumbnail': video_thumbnail,
3477 'description': None,
3482 self._downloader.process_info(info)
3483 except UnavailableVideoError, err:
3484 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): listing is truncated — guards, try: lines and returns
# are missing between the numbered lines below.
3487 class SoundcloudIE(InfoExtractor):
3488 """Information extractor for soundcloud.com
3489 To access the media, the uid of the song and a stream token
3490 must be extracted from the page source and the script must make
3491 a request to media.soundcloud.com/crossdomain.xml. Then
3492 the media can be grabbed by requesting from an url composed
3493 of the stream token and uid
3496 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3497 IE_NAME = u'soundcloud'
3499 def __init__(self, downloader=None):
3500 InfoExtractor.__init__(self, downloader)
3502 def report_webpage(self, video_id):
3503 """Report information extraction."""
3504 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3506 def report_extraction(self, video_id):
3507 """Report information extraction."""
3508 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3510 def _real_extract(self, url):
3511 htmlParser = HTMLParser.HTMLParser()
3513 mobj = re.match(self._VALID_URL, url)
3515 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3518 # extract uploader (which is in the url)
3519 uploader = mobj.group(1).decode('utf-8')
3520 # extract simple title (uploader + slug of song title)
3521 slug_title = mobj.group(2).decode('utf-8')
3522 simple_title = uploader + '-' + slug_title
3524 self.report_webpage('%s/%s' % (uploader, slug_title))
3526 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3528 webpage = urllib2.urlopen(request).read()
3529 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3530 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3533 self.report_extraction('%s/%s' % (uploader, slug_title))
3535 # extract uid and stream token that soundcloud hands out for access
3536 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3538 video_id = mobj.group(1)
3539 stream_token = mobj.group(2)
3541 # extract unsimplified title
3542 mobj = re.search('"title":"(.*?)",', webpage)
3544 title = mobj.group(1)
3546 # construct media url (with uid/token)
3547 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3548 mediaURL = mediaURL % (video_id, stream_token)
3551 description = u'No description available'
3552 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3554 description = mobj.group(1)
# Page date like "September 1, 2011 11:22" -> YYYYMMDD; parse failures
# are swallowed by the except below (upload_date presumably stays at a
# default set on a line missing from this listing).
3558 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3561 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3562 except Exception, e:
3565 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): std_headers is passed as urllib2.Request's 2nd positional
# argument, which is `data` (making this a POST) — headers is the 3rd
# argument. Likely a bug; verify intent before changing.
3566 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3569 self._downloader.process_info({
3570 'id': video_id.decode('utf-8'),
3572 'uploader': uploader.decode('utf-8'),
3573 'upload_date': upload_date,
3574 'title': simple_title.decode('utf-8'),
3575 'stitle': simple_title.decode('utf-8'),
3579 'description': description.decode('utf-8')
3581 except UnavailableVideoError:
3582 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for infoq.com presentations: the page embeds a base64-encoded
# path (jsclassref) which is decoded and appended to an rtmpe:// base to
# form the media URL; title and description come from page metadata.
# NOTE(review): listing is truncated — guards, try: lines and returns
# are missing between the numbered lines.
3585 class InfoQIE(InfoExtractor):
3586 """Information extractor for infoq.com"""
3588 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3591 def report_webpage(self, video_id):
3592 """Report information extraction."""
3593 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3595 def report_extraction(self, video_id):
3596 """Report information extraction."""
3597 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3599 def _real_extract(self, url):
3600 htmlParser = HTMLParser.HTMLParser()
3602 mobj = re.match(self._VALID_URL, url)
3604 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3607 self.report_webpage(url)
3609 request = urllib2.Request(url)
3611 webpage = urllib2.urlopen(request).read()
3612 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3613 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3616 self.report_extraction(url)
# jsclassref holds a base64 blob; decoded, it is the media path.
3620 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3622 self._downloader.trouble(u'ERROR: unable to extract video url')
3624 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3628 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3630 self._downloader.trouble(u'ERROR: unable to extract video title')
3632 video_title = mobj.group(1).decode('utf-8')
3634 # Extract description
3635 video_description = u'No description available.'
3636 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3637 if mobj is not None:
3638 video_description = mobj.group(1).decode('utf-8')
# id/extension are derived from the final path component of the media URL.
3640 video_filename = video_url.split('/')[-1]
3641 video_id, extension = video_filename.split('.')
3643 self._downloader.increment_downloads()
3648 'upload_date': None,
3649 'title': video_title,
3650 'stitle': _simplify_title(video_title),
3652 'format': extension, # Extension is always(?) mp4, but seems to be flv
3654 'description': video_description,
3659 self._downloader.process_info(info)
3660 except UnavailableVideoError, err:
3661 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# Extractor for mixcloud.com: uses the site's JSON API (cloudcast endpoint)
# to list audio formats/bitrates, probes candidate URLs for the first live
# one, then hands the chosen file to the downloader.
# NOTE(review): listing is truncated — guards, try: lines, returns and
# some loop bodies are missing between the numbered lines.
3663 class MixcloudIE(InfoExtractor):
3664 """Information extractor for www.mixcloud.com"""
3665 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3666 IE_NAME = u'mixcloud'
3668 def __init__(self, downloader=None):
3669 InfoExtractor.__init__(self, downloader)
# NOTE(review): file_id parameter is unused in the message below.
3671 def report_download_json(self, file_id):
3672 """Report JSON download."""
3673 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3675 def report_extraction(self, file_id):
3676 """Report information extraction."""
3677 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3679 def get_urls(self, jsonData, fmt, bitrate='best'):
3680 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a bitrate->urls mapping or a plain url list;
# the TypeError fallback below handles the list case.
3683 bitrate_list = jsonData[fmt]
3684 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3685 bitrate = max(bitrate_list) # select highest
3687 url_list = jsonData[fmt][bitrate]
3688 except TypeError: # we have no bitrate info.
3689 url_list = jsonData[fmt]
3693 def check_urls(self, url_list):
3694 """Returns 1st active url from list"""
# Probe each candidate with a GET; network errors mean "try the next".
3695 for url in url_list:
3697 urllib2.urlopen(url)
3699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3704 def _print_formats(self, formats):
3705 print 'Available formats:'
3706 for fmt in formats.keys():
3707 for b in formats[fmt]:
3709 ext = formats[fmt][b][0]
3710 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3711 except TypeError: # we have no bitrate info
3712 ext = formats[fmt][0]
3713 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3716 def _real_extract(self, url):
3717 mobj = re.match(self._VALID_URL, url)
3719 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3721 # extract uploader & filename from url
3722 uploader = mobj.group(1).decode('utf-8')
3723 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3725 # construct API request
3726 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3727 # retrieve .json file with links to files
3728 request = urllib2.Request(file_url)
3730 self.report_download_json(file_url)
3731 jsonData = urllib2.urlopen(request).read()
3732 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3733 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3737 json_data = json.loads(jsonData)
3738 player_url = json_data['player_swf_url']
3739 formats = dict(json_data['audio_formats'])
3741 req_format = self._downloader.params.get('format', None)
3744 if self._downloader.params.get('listformats', None):
3745 self._print_formats(formats)
# 'best' / unspecified: scan formats until a live URL is found;
# otherwise honor the exact requested format.
3748 if req_format is None or req_format == 'best':
3749 for format_param in formats.keys():
3750 url_list = self.get_urls(formats, format_param)
3752 file_url = self.check_urls(url_list)
3753 if file_url is not None:
3756 if req_format not in formats.keys():
3757 self._downloader.trouble(u'ERROR: format is not available')
3760 url_list = self.get_urls(formats, req_format)
3761 file_url = self.check_urls(url_list)
3762 format_param = req_format
3765 self._downloader.increment_downloads()
3767 # Process file information
3768 self._downloader.process_info({
3769 'id': file_id.decode('utf-8'),
3770 'url': file_url.decode('utf-8'),
3771 'uploader': uploader.decode('utf-8'),
3772 'upload_date': u'NA',
3773 'title': json_data['name'],
3774 'stitle': _simplify_title(json_data['name']),
3775 'ext': file_url.split('.')[-1].decode('utf-8'),
3776 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3777 'thumbnail': json_data['thumbnail_url'],
3778 'description': json_data['description'],
3779 'player_url': player_url.decode('utf-8'),
3781 except UnavailableVideoError, err:
3782 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Stanford Open ClassRoom. Handles three URL shapes:
# a specific video (course+video), a course page (list of VideoPage
# links), and the site root (list of CoursePage links); the latter two
# recurse via self.extract() on each discovered reference.
# NOTE(review): listing is truncated — guards, try: lines, returns and
# some else branches are missing between the numbered lines.
3784 class StanfordOpenClassroomIE(InfoExtractor):
3785 """Information extractor for Stanford's Open ClassRoom"""
3787 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3788 IE_NAME = u'stanfordoc'
3790 def report_download_webpage(self, objid):
3791 """Report information extraction."""
3792 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3794 def report_extraction(self, video_id):
3795 """Report information extraction."""
3796 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3798 def _real_extract(self, url):
3799 mobj = re.match(self._VALID_URL, url)
3801 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video identified by course + video query params.
3804 if mobj.group('course') and mobj.group('video'): # A specific video
3805 course = mobj.group('course')
3806 video = mobj.group('video')
3808 'id': _simplify_title(course + '_' + video),
3811 self.report_extraction(info['id'])
3812 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3813 xmlUrl = baseUrl + video + '.xml'
3815 metaXml = urllib2.urlopen(xmlUrl).read()
3816 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3817 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3819 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3821 info['title'] = mdoc.findall('./title')[0].text
3822 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3824 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3826 info['stitle'] = _simplify_title(info['title'])
3827 info['ext'] = info['url'].rpartition('.')[2]
3828 info['format'] = info['ext']
3829 self._downloader.increment_downloads()
3831 self._downloader.process_info(info)
3832 except UnavailableVideoError, err:
3833 self._downloader.trouble(u'\nERROR: unable to download video')
# Case 2: a course page -> emit a reference entry per VideoPage link.
3834 elif mobj.group('course'): # A course page
3835 unescapeHTML = HTMLParser.HTMLParser().unescape
3837 course = mobj.group('course')
3839 'id': _simplify_title(course),
3843 self.report_download_webpage(info['id'])
3845 coursepage = urllib2.urlopen(url).read()
3846 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3847 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3850 m = re.search('<h1>([^<]+)</h1>', coursepage)
3852 info['title'] = unescapeHTML(m.group(1))
3854 info['title'] = info['id']
3855 info['stitle'] = _simplify_title(info['title'])
3857 m = re.search('<description>([^<]+)</description>', coursepage)
3859 info['description'] = unescapeHTML(m.group(1))
3861 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3864 'type': 'reference',
3865 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3869 for entry in info['list']:
3870 assert entry['type'] == 'reference'
3871 self.extract(entry['url'])
# Case 3: the site root -> emit a reference entry per CoursePage link.
3873 unescapeHTML = HTMLParser.HTMLParser().unescape
3876 'id': 'Stanford OpenClassroom',
3880 self.report_download_webpage(info['id'])
3881 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3883 rootpage = urllib2.urlopen(rootURL).read()
3884 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3885 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3888 info['title'] = info['id']
3889 info['stitle'] = _simplify_title(info['title'])
3891 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3894 'type': 'reference',
3895 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3899 for entry in info['list']:
3900 assert entry['type'] == 'reference'
3901 self.extract(entry['url'])
3903 class MTVIE(InfoExtractor):
3904 """Information extractor for MTV.com"""
3906 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3909 def report_webpage(self, video_id):
3910 """Report information extraction."""
3911 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3913 def report_extraction(self, video_id):
3914 """Report information extraction."""
3915 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3917 def _real_extract(self, url):
3918 mobj = re.match(self._VALID_URL, url)
3920 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3922 if not mobj.group('proto'):
3923 url = 'http://' + url
3924 video_id = mobj.group('videoid')
3925 self.report_webpage(video_id)
3927 request = urllib2.Request(url)
3929 webpage = urllib2.urlopen(request).read()
3930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3931 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3934 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3936 self._downloader.trouble(u'ERROR: unable to extract song name')
3938 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3939 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3941 self._downloader.trouble(u'ERROR: unable to extract performer')
3943 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3944 video_title = performer + ' - ' + song_name
3946 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3948 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3950 mtvn_uri = mobj.group(1)
3952 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3954 self._downloader.trouble(u'ERROR: unable to extract content id')
3956 content_id = mobj.group(1)
3958 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3959 self.report_extraction(video_id)
3960 request = urllib2.Request(videogen_url)
3962 metadataXml = urllib2.urlopen(request).read()
3963 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3964 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3967 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3968 renditions = mdoc.findall('.//rendition')
3970 # For now, always pick the highest quality.
3971 rendition = renditions[-1]
3974 _,_,ext = rendition.attrib['type'].partition('/')
3975 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3976 video_url = rendition.find('./src').text
3978 self._downloader.trouble('Invalid rendition field.')
3981 self._downloader.increment_downloads()
3985 'uploader': performer,
3986 'title': video_title,
3987 'stitle': _simplify_title(video_title),
3993 self._downloader.process_info(info)
3994 except UnavailableVideoError, err:
3995 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3998 class PostProcessor(object):
3999 """Post Processor class.
4001 PostProcessor objects can be added to downloaders with their
4002 add_post_processor() method. When the downloader has finished a
4003 successful download, it will take its internal chain of PostProcessors
4004 and start calling the run() method on each one of them, first with
4005 an initial argument and then with the returned value of the previous
4008 The chain will be stopped if one of them ever returns None or the end
4009 of the chain is reached.
4011 PostProcessor objects follow a "mutual registration" process similar
4012 to InfoExtractor objects.
4017 def __init__(self, downloader=None):
4018 self._downloader = downloader
4020 def set_downloader(self, downloader):
4021 """Sets the downloader for this PP."""
4022 self._downloader = downloader
4024 def run(self, information):
4025 """Run the PostProcessor.
4027 The "information" argument is a dictionary like the ones
4028 composed by InfoExtractors. The only difference is that this
4029 one has an extra field called "filepath" that points to the
4032 When this method returns None, the postprocessing chain is
4033 stopped. However, this method may return an information
4034 dictionary that will be passed to the next postprocessing
4035 object in the chain. It can be the one it received after
4036 changing some fields.
4038 In addition, this method may raise a PostProcessingError
4039 exception that will be taken into account by the downloader
# Base implementation: pass the info dict through unchanged so the
# chain continues; subclasses override run() with real work.
4042 return information # by default, do nothing
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails while extracting or converting audio.

	Subclasses Exception rather than BaseException: BaseException is
	reserved for interpreter-exit signals (KeyboardInterrupt, SystemExit),
	and deriving from it lets the error escape generic `except Exception`
	handlers. Callers that catch AudioConversionError by name are
	unaffected by this change.
	"""
	def __init__(self, message):
		Exception.__init__(self, message)
		# Keep the original attribute so existing callers reading
		# err.message continue to work.
		self.message = message
4048 class FFmpegExtractAudioPP(PostProcessor):
# Configure the audio-extraction post-processor.
#   preferredcodec: target codec name, or None/'best' to keep the source codec.
#   preferredquality: passed to ffmpeg as the audio bitrate (-ab) when set.
#   keepvideo: when True, the original video file is not deleted afterwards.
4050 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4051 PostProcessor.__init__(self, downloader)
4052 if preferredcodec is None:
4053 preferredcodec = 'best'
4054 self._preferredcodec = preferredcodec
4055 self._preferredquality = preferredquality
4056 self._keepvideo = keepvideo
# Probe `path` with ffprobe and return the name of its audio codec.
# Scans ffprobe's key=value output, remembering the last codec_name seen
# and confirming it when a codec_type=audio line follows.
# NOTE(review): listing is truncated — the return statements and the
# audio_codec initialization are missing from the lines below.
4059 def get_audio_codec(path):
4061 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4062 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4063 output = handle.communicate()[0]
4064 if handle.wait() != 0:
# ffprobe missing or unreadable file: treated as "codec unknown".
4066 except (IOError, OSError):
4069 for line in output.split('\n'):
4070 if line.startswith('codec_name='):
4071 audio_codec = line.split('=')[1].strip()
4072 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode `path` into `out_path` using `codec` plus any
# extra options; raises AudioConversionError on failure (including a
# missing ffmpeg binary, detected via OSError errno 2 / ENOENT).
# NOTE(review): listing is truncated — lines handling codec=None and the
# re-raise path are missing from the lines below.
4077 def run_ffmpeg(path, out_path, codec, more_opts):
4081 acodec_opts = ['-acodec', codec]
4082 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4084 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4085 stdout,stderr = p.communicate()
4086 except (IOError, OSError):
4087 e = sys.exc_info()[1]
4088 if isinstance(e, OSError) and e.errno == 2:
4089 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
# Non-zero exit: surface ffmpeg's last stderr line as the error message.
4092 if p.returncode != 0:
4093 msg = stderr.strip().split('\n')[-1]
4094 raise AudioConversionError(msg)
4096 def run(self, information):
4097 path = information['filepath']
4099 filecodec = self.get_audio_codec(path)
4100 if filecodec is None:
4101 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4105 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4106 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4107 # Lossless, but in another container
4109 extension = self._preferredcodec
4110 more_opts = ['-absf', 'aac_adtstoasc']
4111 elif filecodec in ['aac', 'mp3', 'vorbis']:
4112 # Lossless if possible
4114 extension = filecodec
4115 if filecodec == 'aac':
4116 more_opts = ['-f', 'adts']
4117 if filecodec == 'vorbis':
4121 acodec = 'libmp3lame'
4124 if self._preferredquality is not None:
4125 more_opts += ['-ab', self._preferredquality]
4127 # We convert the audio (lossy)
4128 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4129 extension = self._preferredcodec
4131 if self._preferredquality is not None:
4132 more_opts += ['-ab', self._preferredquality]
4133 if self._preferredcodec == 'aac':
4134 more_opts += ['-f', 'adts']
4135 if self._preferredcodec == 'm4a':
4136 more_opts += ['-absf', 'aac_adtstoasc']
4137 if self._preferredcodec == 'vorbis':
4139 if self._preferredcodec == 'wav':
4141 more_opts += ['-f', 'wav']
4143 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4144 new_path = prefix + sep + extension
4145 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4147 self.run_ffmpeg(path, new_path, acodec, more_opts)
4149 etype,e,tb = sys.exc_info()
4150 if isinstance(e, AudioConversionError):
4151 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4153 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4156 # Try to update the date time for extracted audio file.
4157 if information.get('filetime') is not None:
4159 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4161 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4163 if not self._keepvideo:
4165 os.remove(_encodeFilename(path))
4166 except (IOError, OSError):
4167 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4170 information['filepath'] = new_path
# Self-update: download the latest script from UPDATE_URL and overwrite the
# running program file in place.
# NOTE(review): the `try:` lines matching the except clauses below are on
# elided lines in this listing.
4174 def updateSelf(downloader, filename):
4175 ''' Update the program file with the latest version from the repository '''
4176 # Note: downloader only used for options
# Fail early if we cannot write the target file.
4177 if not os.access(filename, os.W_OK):
4178 sys.exit('ERROR: no write permissions on %s' % filename)
4180 downloader.to_screen(u'Updating to latest version...')
4184 urlh = urllib.urlopen(UPDATE_URL)
4185 newcontent = urlh.read()
# Parse the downloaded script's own __version__ to detect "already current".
4187 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4188 if vmatch is not None and vmatch.group(1) == __version__:
4189 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4193 except (IOError, OSError), err:
4194 sys.exit('ERROR: unable to download latest version')
# Binary mode so the file is written byte-for-byte as downloaded.
4197 outf = open(filename, 'wb')
4199 outf.write(newcontent)
4202 except (IOError, OSError), err:
4203 sys.exit('ERROR: unable to overwrite current version')
4205 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line options from a config file, one shlex-split line
# at a time ('#' comments allowed); a missing file yields no options.
# NOTE(review): the try/except and the loop header around `res` are on
# elided lines.
4213 def _readOptions(filename_bytes):
4215 optionf = open(filename_bytes)
4217 return [] # silently skip if file is not present
4221 res += shlex.split(l, comments=True)
# Custom optparse option formatter: "-o, --option METAVAR" instead of the
# default "--option=METAVAR, -o" style.
# NOTE(review): the initialization of `opts` (likely `opts = []`) is on an
# elided line.
4226 def _format_option_string(option):
4227 ''' ('-o', '--option') -> -o, --format METAVAR'''
4231 if option._short_opts: opts.append(option._short_opts[0])
4232 if option._long_opts: opts.append(option._long_opts[0])
# Insert the ", " separator only when both a short and a long form exist.
4233 if len(opts) > 1: opts.insert(1, ', ')
4235 if option.takes_value(): opts.append(' %s' % option.metavar)
4237 return "".join(opts)
# Detect the terminal width: honor $COLUMNS when set, otherwise ask
# `stty size` (which prints "rows cols").
# NOTE(review): the COLUMNS-return branch, the `try:` line, and the failure
# return are on elided lines.
4239 def _find_term_columns():
4240 columns = os.environ.get('COLUMNS', None)
4245 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4246 out,err = sp.communicate()
# Second field of "rows cols" is the column count.
4247 return int(out.split()[1])
# Body of parseOpts(): build the optparse parser (wide-console aware
# formatting), declare all option groups, merge config-file options from
# /etc and the user's config with sys.argv, and return (parser, opts, args).
# NOTE(review): the enclosing `def parseOpts():` line (~4250) and a few
# other lines are elided in this listing.
4253 max_help_position = 80
4255 # No need to wrap help messages if we're on a wide console
4256 columns = _find_term_columns()
4257 if columns: max_width = columns
4259 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4260 fmt.format_option_strings = _format_option_string
# 'resolve' lets a later option silently take over a conflicting flag
# (relied upon below: -v is declared for both --version and --verbose).
4263 'version' : __version__,
4265 'usage' : '%prog [options] url [url...]',
4266 'conflict_handler' : 'resolve',
4269 parser = optparse.OptionParser(**kw)
4272 general = optparse.OptionGroup(parser, 'General Options')
4273 selection = optparse.OptionGroup(parser, 'Video Selection')
4274 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4275 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4276 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4277 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4278 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4280 general.add_option('-h', '--help',
4281 action='help', help='print this help text and exit')
4282 general.add_option('-v', '--version',
4283 action='version', help='print program version and exit')
4284 general.add_option('-U', '--update',
4285 action='store_true', dest='update_self', help='update this program to latest version')
4286 general.add_option('-i', '--ignore-errors',
4287 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4288 general.add_option('-r', '--rate-limit',
4289 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4290 general.add_option('-R', '--retries',
4291 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4292 general.add_option('--dump-user-agent',
4293 action='store_true', dest='dump_user_agent',
4294 help='display the current browser identification', default=False)
4295 general.add_option('--list-extractors',
4296 action='store_true', dest='list_extractors',
4297 help='List all supported extractors and the URLs they would handle', default=False)
4299 selection.add_option('--playlist-start',
4300 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4301 selection.add_option('--playlist-end',
4302 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4303 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4304 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4305 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4307 authentication.add_option('-u', '--username',
4308 dest='username', metavar='USERNAME', help='account username')
4309 authentication.add_option('-p', '--password',
4310 dest='password', metavar='PASSWORD', help='account password')
4311 authentication.add_option('-n', '--netrc',
4312 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4315 video_format.add_option('-f', '--format',
4316 action='store', dest='format', metavar='FORMAT', help='video format code')
4317 video_format.add_option('--all-formats',
4318 action='store_const', dest='format', help='download all available video formats', const='all')
4319 video_format.add_option('--prefer-free-formats',
4320 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4321 video_format.add_option('--max-quality',
4322 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4323 video_format.add_option('-F', '--list-formats',
4324 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4327 verbosity.add_option('-q', '--quiet',
4328 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4329 verbosity.add_option('-s', '--simulate',
4330 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4331 verbosity.add_option('--skip-download',
4332 action='store_true', dest='skip_download', help='do not download the video', default=False)
4333 verbosity.add_option('-g', '--get-url',
4334 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4335 verbosity.add_option('-e', '--get-title',
4336 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4337 verbosity.add_option('--get-thumbnail',
4338 action='store_true', dest='getthumbnail',
4339 help='simulate, quiet but print thumbnail URL', default=False)
4340 verbosity.add_option('--get-description',
4341 action='store_true', dest='getdescription',
4342 help='simulate, quiet but print video description', default=False)
4343 verbosity.add_option('--get-filename',
4344 action='store_true', dest='getfilename',
4345 help='simulate, quiet but print output filename', default=False)
4346 verbosity.add_option('--get-format',
4347 action='store_true', dest='getformat',
4348 help='simulate, quiet but print output format', default=False)
4349 verbosity.add_option('--no-progress',
4350 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4351 verbosity.add_option('--console-title',
4352 action='store_true', dest='consoletitle',
4353 help='display progress in console titlebar', default=False)
# '-v' re-uses the short flag declared for --version above; the parser's
# conflict_handler='resolve' makes this later definition win for -v.
4354 verbosity.add_option('-v', '--verbose',
4355 action='store_true', dest='verbose', help='print various debugging information', default=False)
4358 filesystem.add_option('-t', '--title',
4359 action='store_true', dest='usetitle', help='use title in file name', default=False)
4360 filesystem.add_option('-l', '--literal',
4361 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4362 filesystem.add_option('-A', '--auto-number',
4363 action='store_true', dest='autonumber',
4364 help='number downloaded files starting from 00000', default=False)
4365 filesystem.add_option('-o', '--output',
4366 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4367 filesystem.add_option('-a', '--batch-file',
4368 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4369 filesystem.add_option('-w', '--no-overwrites',
4370 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4371 filesystem.add_option('-c', '--continue',
4372 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4373 filesystem.add_option('--no-continue',
4374 action='store_false', dest='continue_dl',
4375 help='do not resume partially downloaded files (restart from beginning)')
4376 filesystem.add_option('--cookies',
4377 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4378 filesystem.add_option('--no-part',
4379 action='store_true', dest='nopart', help='do not use .part files', default=False)
4380 filesystem.add_option('--no-mtime',
4381 action='store_false', dest='updatetime',
4382 help='do not use the Last-modified header to set the file modification time', default=True)
4383 filesystem.add_option('--write-description',
4384 action='store_true', dest='writedescription',
4385 help='write video description to a .description file', default=False)
4386 filesystem.add_option('--write-info-json',
4387 action='store_true', dest='writeinfojson',
4388 help='write video metadata to a .info.json file', default=False)
4391 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4392 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4393 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4394 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4395 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4396 help='ffmpeg audio bitrate specification, 128k by default')
4397 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4398 help='keeps the video file on disk after the post-processing; the video is erased by default')
4401 parser.add_option_group(general)
4402 parser.add_option_group(selection)
4403 parser.add_option_group(filesystem)
4404 parser.add_option_group(verbosity)
4405 parser.add_option_group(video_format)
4406 parser.add_option_group(authentication)
4407 parser.add_option_group(postproc)
# Config precedence: /etc config, then the per-user config, then the real
# command line (later options override earlier ones in optparse).
4409 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4411 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4413 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4414 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4415 opts, args = parser.parse_args(argv)
4417 return parser, opts, args
# Build the ordered list of InfoExtractor instances; URL dispatch picks the
# first extractor whose suitable() matches, so order matters.
# NOTE(review): most list entries and the return statement are on elided
# lines; the shared youtube/google/yahoo instances are passed to the
# playlist/user/search wrappers visible below.
4419 def gen_extractors():
4420 """ Return a list of an instance of every supported extractor.
4421 The order does matter; the first extractor matched is the one handling the URL.
4423 youtube_ie = YoutubeIE()
4424 google_ie = GoogleIE()
4425 yahoo_ie = YahooIE()
4427 YoutubePlaylistIE(youtube_ie),
4428 YoutubeUserIE(youtube_ie),
4429 YoutubeSearchIE(youtube_ie),
4431 MetacafeIE(youtube_ie),
4434 GoogleSearchIE(google_ie),
4437 YahooSearchIE(yahoo_ie),
4450 StanfordOpenClassroomIE(),
# Body of the main driver (_real_main): parse options, set up cookies and
# the urllib2 opener, validate option combinations, configure the
# FileDownloader, register extractors/post-processors, and run the
# downloads.
# NOTE(review): the enclosing `def` line (~4456) and many `try:`/`else:`
# lines are elided in this listing.
4457 parser, opts, args = parseOpts()
4459 # Open appropriate CookieJar
4460 if opts.cookiefile is None:
4461 jar = cookielib.CookieJar()
4464 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load the file if it already exists and is readable; a fresh jar is
# fine otherwise (it gets dumped at exit when --cookies was given).
4465 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4467 except (IOError, OSError), err:
4468 sys.exit(u'ERROR: unable to open cookie file')
4471 if opts.dump_user_agent:
4472 print std_headers['User-Agent']
4475 # Batch file verification
4477 if opts.batchfile is not None:
4479 if opts.batchfile == '-':
4482 batchfd = open(opts.batchfile, 'r')
4483 batchurls = batchfd.readlines()
4484 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
4485 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4487 sys.exit(u'ERROR: batch file could not be read')
4488 all_urls = batchurls + args
4490 # General configuration
4491 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4492 proxy_handler = urllib2.ProxyHandler()
4493 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4494 urllib2.install_opener(opener)
4495 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4498 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4500 extractors = gen_extractors()
4502 if opts.list_extractors:
4503 for ie in extractors:
# Show which of the given URLs each extractor would handle; each URL is
# claimed by at most one extractor (first match wins).
4505 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4506 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4507 for mu in matchedUrls:
4511 # Conflicting, missing and erroneous options
4512 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4513 parser.error(u'using .netrc conflicts with giving username/password')
4514 if opts.password is not None and opts.username is None:
4515 parser.error(u'account username missing')
4516 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4517 parser.error(u'using output template conflicts with using title, literal title or auto number')
4518 if opts.usetitle and opts.useliteral:
4519 parser.error(u'using title conflicts with using literal title')
4520 if opts.username is not None and opts.password is None:
4521 opts.password = getpass.getpass(u'Type account password and press return:')
4522 if opts.ratelimit is not None:
4523 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4524 if numeric_limit is None:
4525 parser.error(u'invalid rate limit specified')
4526 opts.ratelimit = numeric_limit
4527 if opts.retries is not None:
4529 opts.retries = long(opts.retries)
4530 except (TypeError, ValueError), err:
4531 parser.error(u'invalid retry count specified')
4533 opts.playliststart = int(opts.playliststart)
4534 if opts.playliststart <= 0:
4535 raise ValueError(u'Playlist start must be positive')
4536 except (TypeError, ValueError), err:
4537 parser.error(u'invalid playlist start number specified')
4539 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "until the end of the playlist".
4540 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4541 raise ValueError(u'Playlist end must be greater than playlist start')
4542 except (TypeError, ValueError), err:
4543 parser.error(u'invalid playlist end number specified')
4544 if opts.extractaudio:
4545 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4546 parser.error(u'invalid audio format specified')
4549 fd = FileDownloader({
4550 'usenetrc': opts.usenetrc,
4551 'username': opts.username,
4552 'password': opts.password,
# Any "print X and stop" option implies quiet mode and skipping downloads.
4553 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4554 'forceurl': opts.geturl,
4555 'forcetitle': opts.gettitle,
4556 'forcethumbnail': opts.getthumbnail,
4557 'forcedescription': opts.getdescription,
4558 'forcefilename': opts.getfilename,
4559 'forceformat': opts.getformat,
4560 'simulate': opts.simulate,
4561 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4562 'format': opts.format,
4563 'format_limit': opts.format_limit,
4564 'listformats': opts.listformats,
# First truthy template wins: explicit -o, then format/title/autonumber
# combinations, falling back to plain '%(id)s.%(ext)s'.
4565 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4566 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4567 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4568 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4569 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4570 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4571 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4572 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4573 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4574 or u'%(id)s.%(ext)s'),
4575 'ignoreerrors': opts.ignoreerrors,
4576 'ratelimit': opts.ratelimit,
4577 'nooverwrites': opts.nooverwrites,
4578 'retries': opts.retries,
4579 'continuedl': opts.continue_dl,
4580 'noprogress': opts.noprogress,
4581 'playliststart': opts.playliststart,
4582 'playlistend': opts.playlistend,
# Writing the video to stdout means progress output must go to stderr.
4583 'logtostderr': opts.outtmpl == '-',
4584 'consoletitle': opts.consoletitle,
4585 'nopart': opts.nopart,
4586 'updatetime': opts.updatetime,
4587 'writedescription': opts.writedescription,
4588 'writeinfojson': opts.writeinfojson,
4589 'matchtitle': opts.matchtitle,
4590 'rejecttitle': opts.rejecttitle,
4591 'max_downloads': opts.max_downloads,
4592 'prefer_free_formats': opts.prefer_free_formats,
4594 for extractor in extractors:
4595 fd.add_info_extractor(extractor)
4598 if opts.extractaudio:
4599 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4602 if opts.update_self:
4603 updateSelf(fd, sys.argv[0])
4606 if len(all_urls) < 1:
# -U alone is a valid invocation; only error when no URLs and no update.
4607 if not opts.update_self:
4608 parser.error(u'you must provide at least one URL')
4613 retcode = fd.download(all_urls)
4614 except MaxDownloadsReached:
4615 fd.to_screen(u'--max-download limit reached, aborting.')
4618 # Dump cookie jar if requested
4619 if opts.cookiefile is not None:
4622 except (IOError, OSError), err:
4623 sys.exit(u'ERROR: unable to save cookie jar')
# Tail of main(): translate known exceptions from the driver into exit
# messages, plus the script entry-point guard.
# NOTE(review): the enclosing `def main():`, its `try:` line, and the call
# under the __main__ guard are on elided lines in this listing.
4630 except DownloadError:
4632 except SameFileError:
4633 sys.exit(u'ERROR: fixed output name but more than one file to download')
4634 except KeyboardInterrupt:
4635 sys.exit(u'\nERROR: Interrupted by user')
4637 if __name__ == '__main__':
4640 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: