2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.01.08b'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
# Fallback JSON decoder ("trivialjson") used when the stdlib `json` module is
# unavailable (Python < 2.6).  NOTE(review): gapped excerpt — the enclosing
# `try: import json`, the `def loads(s):` line and many interior lines are
# missing from this view; code kept byte-identical.
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Raise a ValueError annotated with the offending position and remaining input.
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Skip ASCII whitespace; when expectMore is set, running off the end is an error.
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
# Decode a single JSON string escape (one match of `rexp` below).
100 def decodeEscape(match):
# \uXXXX escape -> single BMP character.
116 return unichr(int(esc[1:5], 16))
# Surrogate pair: high surrogate followed by a literal '\u' plus low surrogate.
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
# String parsing: a closing quote preceded by an even number of backslashes
# terminates the literal.
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
# Escape regex: surrogate pairs first, then plain \uXXXX, then 1-char escapes.
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
# --- object parsing ---
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
# --- array parsing ---
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
# true/false/null literals.
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
# JSON number grammar; a '.', 'e' or 'E' selects float over int.
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
# Dispatch on the first character of a value; anything else is tried as a number.
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# Determine the encoding used for console output.  NOTE(review): gapped
# excerpt — the docstring closer and the try/except fallback around the
# locale lookup are missing from this view; code kept byte-identical.
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
# Inner generator so the (possibly failing) locale call happens lazily and
# a fallback value can be yielded on error.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
# .next(): Python 2 generator protocol — take the first yielded value.
210 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference.  BUG FIX: the previous pattern
	# ur'(?u)#(x?\d+)' could never match hexadecimal entities containing the
	# digits a-f (e.g. &#x2F;), because \d only matches decimal digits.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Prepend '0' so the literal reads '0x...'; long() accepts that
			# prefix with base 16.
			numstr = u'0%s' % numstr
		else:
			base = 10
		# unichr()/long(): Python 2 builtins.
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Resolve HTML entities first, then make the result path-safe by
	# replacing the OS path separator with '%'.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
# NOTE(review): gapped excerpt — the docstring closer, the `filename == u'-'`
# branch header and the surrounding try: are missing from this view; code
# kept byte-identical.
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
# Writing to stdout: on Windows, switch stdout to binary mode so video data
# is not mangled by CRLF translation.
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(_encodeFilename(filename), open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(_encodeFilename(filename), open_mode)
270 return (stream, filename)
# NOTE(review): gapped excerpt — the `timestamp = None` initializer and the
# final `return timestamp` are missing from this view; code kept
# byte-identical.  Returns None when the string cannot be parsed.
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
# NOTE(review): the function body is missing from this view — only the
# signature and docstring are visible.  Deduplicates while (presumably)
# preserving input order; confirm against the full source.
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
def _unescapeHTML(s):
	"""Undo HTML entity escaping in the given string.

	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	# Delegate entity handling to the (Python 2) HTMLParser module.
	unescaper = HTMLParser.HTMLParser()
	return unescaper.unescape(s)
302 def _encodeFilename(s):
304 @param s The name of the file (of type unicode)
307 assert type(s) == type(u'')
308 return s.encode(sys.getfilesystemencoding(), 'ignore')
# Exception hierarchy used throughout the downloader.  NOTE(review): gapped
# excerpt — each class's docstring closer and `pass` statement are missing
# from this view; code kept byte-identical.
310 class DownloadError(Exception):
311 """Download Error exception.
313 This exception may be thrown by FileDownloader objects if they are not
314 configured to continue on errors. They will contain the appropriate
320 class SameFileError(Exception):
321 """Same File exception.
323 This exception will be thrown by FileDownloader objects if they detect
324 multiple files would have to be downloaded to the same file on disk.
329 class PostProcessingError(Exception):
330 """Post Processing exception.
332 This exception may be raised by PostProcessor's .run() method to
333 indicate an error in the postprocessing task.
337 class MaxDownloadsReached(Exception):
338 """ --max-downloads limit has been reached. """
342 class UnavailableVideoError(Exception):
343 """Unavailable Format exception.
345 This exception will be thrown when a video is requested
346 in a format that is not available for that video.
351 class ContentTooShortError(Exception):
352 """Content Too Short exception.
354 This exception may be raised by FileDownloader objects when a file they
355 download is too small for what the server announced first, indicating
356 the connection was probably interrupted.
# Stores both byte counts so callers can report expected vs. received.
362 def __init__(self, downloaded, expected):
363 self.downloaded = downloaded
364 self.expected = expected
# urllib2 handler that injects the default std_headers and transparently
# decompresses gzip/deflate responses.  NOTE(review): gapped excerpt — the
# deflate() method header, @staticmethod decorators, `old_resp` assignments
# and the return statements are missing from this view; code kept
# byte-identical.
367 class YoutubeDLHandler(urllib2.HTTPHandler):
368 """Handler for HTTP requests and responses.
370 This class, when installed with an OpenerDirector, automatically adds
371 the standard headers to every HTTP request and handles gzipped and
372 deflated responses from web servers. If compression is to be avoided in
373 a particular request, the original request in the program code only has
374 to include the HTTP header "Youtubedl-No-Compression", which will be
375 removed before making the real request.
377 Part of this code was copied from:
379 http://techknack.net/python-urllib2-handlers/
381 Andrew Rowls, the author of that code, agreed to release it to the
# Raw deflate streams lack the zlib header (hence -MAX_WBITS); fall back to
# a normal zlib stream if that fails.
388 return zlib.decompress(data, -zlib.MAX_WBITS)
390 return zlib.decompress(data)
# Older Pythons' addinfourl has no `code` constructor argument / getcode().
393 def addinfourl_wrapper(stream, headers, url, code):
394 if hasattr(urllib2.addinfourl, 'getcode'):
395 return urllib2.addinfourl(stream, headers, url, code)
396 ret = urllib2.addinfourl(stream, headers, url)
# Add each default header unless the caller already set it; the
# Youtubedl-no-compression marker suppresses Accept-encoding and is stripped
# before the request goes on the wire.
400 def http_request(self, req):
401 for h in std_headers:
404 req.add_header(h, std_headers[h])
405 if 'Youtubedl-no-compression' in req.headers:
406 if 'Accept-encoding' in req.headers:
407 del req.headers['Accept-encoding']
408 del req.headers['Youtubedl-no-compression']
# Re-wrap compressed responses so callers read decompressed data while the
# original status line and headers are preserved.
411 def http_response(self, req, resp):
414 if resp.headers.get('Content-encoding', '') == 'gzip':
415 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
416 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
417 resp.msg = old_resp.msg
419 if resp.headers.get('Content-encoding', '') == 'deflate':
420 gz = StringIO.StringIO(self.deflate(resp.read()))
421 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
422 resp.msg = old_resp.msg
# NOTE(review): gapped excerpt of the FileDownloader class header — blank
# docstring lines and the closing quotes are missing from this view; code
# kept byte-identical.
426 class FileDownloader(object):
427 """File Downloader class.
429 File downloader objects are the ones responsible of downloading the
430 actual video file and writing it to disk if the user has requested
431 it, among some other tasks. In most cases there should be one per
432 program. As, given a video URL, the downloader doesn't know how to
433 extract all the needed information, task that InfoExtractors do, it
434 has to pass the URL to one of them.
436 For this, file downloader objects have a method that allows
437 InfoExtractors to be registered in a given order. When it is passed
438 a URL, the file downloader handles it to the first InfoExtractor it
439 finds that reports being able to handle it. The InfoExtractor extracts
440 all the information about the video or videos the URL refers to, and
441 asks the FileDownloader to process the video information, possibly
442 downloading the video.
444 File downloaders accept a lot of parameters. In order not to saturate
445 the object constructor with arguments, it receives a dictionary of
446 options instead. These options are available through the params
447 attribute for the InfoExtractors to use. The FileDownloader also
448 registers itself as the downloader in charge for the InfoExtractors
449 that are added to it, so this is a "mutual registration".
453 username: Username for authentication purposes.
454 password: Password for authentication purposes.
455 usenetrc: Use netrc for authentication instead.
456 quiet: Do not print messages to stdout.
457 forceurl: Force printing final URL.
458 forcetitle: Force printing title.
459 forcethumbnail: Force printing thumbnail URL.
460 forcedescription: Force printing description.
461 forcefilename: Force printing final filename.
462 simulate: Do not download the video files.
463 format: Video format code.
464 format_limit: Highest quality format to try.
465 outtmpl: Template for output names.
466 ignoreerrors: Do not stop on download errors.
467 ratelimit: Download speed limit, in bytes/sec.
468 nooverwrites: Prevent overwriting files.
469 retries: Number of times to retry for HTTP error 5xx
470 continuedl: Try to continue downloads if possible.
471 noprogress: Do not print the progress bar.
472 playliststart: Playlist item to start at.
473 playlistend: Playlist item to end at.
474 matchtitle: Download only matching titles.
475 rejecttitle: Reject downloads for matching titles.
476 logtostderr: Log messages to stderr instead of stdout.
477 consoletitle: Display progress in console window's titlebar.
478 nopart: Do not use temporary .part files.
479 updatetime: Use the Last-modified header to set output file timestamps.
480 writedescription: Write the video description to a .description file
481 writeinfojson: Write the video description to a .info.json file
# Per-instance state, (re)initialised in __init__ below.
487 _download_retcode = None
488 _num_downloads = None
# NOTE(review): gapped — the lines initialising self._ies / self._pps /
# self.params are missing from this view; code kept byte-identical.
491 def __init__(self, params):
492 """Create a FileDownloader object with the given options."""
495 self._download_retcode = 0
496 self._num_downloads = 0
# Boolean index: False -> sys.stdout, True -> sys.stderr.
497 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# Static formatting/calculation helpers.  NOTE(review): gapped excerpt — the
# @staticmethod decorators, most docstrings and several guard branches
# (None/zero handling, returns) are missing from this view; code kept
# byte-identical.
501 def format_bytes(bytes):
504 if type(bytes) is str:
# Pick a power-of-1024 suffix; 'b' = bytes, 'k' = KiB, and so on.
509 exponent = long(math.log(bytes, 1024.0))
510 suffix = 'bkMGTPEZY'[exponent]
511 converted = float(bytes) / float(1024 ** exponent)
512 return '%.2f%s' % (converted, suffix)
515 def calc_percent(byte_counter, data_len):
518 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
521 def calc_eta(start, now, total, current):
# Guard against division by (near) zero before any data has flowed.
525 if current == 0 or dif < 0.001: # One millisecond
527 rate = float(current) / dif
528 eta = long((float(total) - float(current)) / rate)
529 (eta_mins, eta_secs) = divmod(eta, 60)
532 return '%02d:%02d' % (eta_mins, eta_secs)
535 def calc_speed(start, now, bytes):
537 if bytes == 0 or dif < 0.001: # One millisecond
538 return '%10s' % '---b/s'
539 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adaptive block size: move toward the measured rate, clamped to [min, 4MB].
542 def best_block_size(elapsed_time, bytes):
543 new_min = max(bytes / 2.0, 1.0)
544 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
545 if elapsed_time < 0.001:
547 rate = bytes / elapsed_time
555 def parse_bytes(bytestr):
556 """Parse a string indicating a byte quantity into a long integer."""
557 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
560 number = float(matchobj.group(1))
# An empty suffix indexes position 0 ('b') -> multiplier 1024**0 == 1.
561 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
562 return long(round(number * multiplier))
# Mutual registration with extractors/postprocessors, and console output.
# NOTE(review): gapped — the list appends (self._ies / self._pps) are missing
# from this view; code kept byte-identical.
564 def add_info_extractor(self, ie):
565 """Add an InfoExtractor object to the end of the list."""
567 ie.set_downloader(self)
569 def add_post_processor(self, pp):
570 """Add a PostProcessor object to the end of the chain."""
572 pp.set_downloader(self)
574 def to_screen(self, message, skip_eol=False):
575 """Print message to stdout if not in quiet mode."""
576 assert type(message) == type(u'')
577 if not self.params.get('quiet', False):
# skip_eol picks the empty terminator (used for \r progress updates).
578 terminator = [u'\n', u''][skip_eol]
579 output = message + terminator
581 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
582 output = output.encode(preferredencoding(), 'ignore')
583 self._screen_file.write(output)
584 self._screen_file.flush()
586 def to_stderr(self, message):
587 """Print message to stderr."""
# Python 2 print-chevron syntax; message encoded for the console charset.
588 print >>sys.stderr, message.encode(preferredencoding())
590 def to_cons_title(self, message):
591 """Set console/terminal window title to message."""
592 if not self.params.get('consoletitle', False):
# Windows: use the console API; elsewhere emit the xterm title escape code.
594 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
595 # c_wchar_p() might not be necessary if `message` is
596 # already of type unicode()
597 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
598 elif 'TERM' in os.environ:
599 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
601 def fixed_template(self):
602 """Checks if the output template is fixed."""
# "Fixed" = the template contains no %(field)s substitutions at all.
603 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
605 def trouble(self, message=None):
606 """Determine action to take when a download problem appears.
608 Depending on if the downloader has been configured to ignore
609 download errors or not, this method may throw an exception or
610 not when errors are found, after printing the message.
612 if message is not None:
613 self.to_stderr(message)
614 if not self.params.get('ignoreerrors', False):
615 raise DownloadError(message)
# When errors are ignored, remember the failure for the process exit code.
616 self._download_retcode = 1
618 def slow_down(self, start_time, byte_counter):
619 """Sleep if the download speed is over the rate limit."""
620 rate_limit = self.params.get('ratelimit', None)
621 if rate_limit is None or byte_counter == 0:
# NOTE(review): the `now = time.time()` assignment is missing from this view.
624 elapsed = now - start_time
627 speed = float(byte_counter) / elapsed
628 if speed > rate_limit:
# Sleep just long enough that the average speed drops back to the limit.
629 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# .part-file management and filesystem helpers.  NOTE(review): gapped — some
# early `return` lines and a try: are missing from this view; code kept
# byte-identical.
631 def temp_name(self, filename):
632 """Returns a temporary filename for the given filename."""
# No .part file when disabled, when writing to stdout ('-'), or when the
# target exists but is not a regular file (e.g. a named pipe).
633 if self.params.get('nopart', False) or filename == u'-' or \
634 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
636 return filename + u'.part'
638 def undo_temp_name(self, filename):
639 if filename.endswith(u'.part'):
640 return filename[:-len(u'.part')]
643 def try_rename(self, old_filename, new_filename):
# Renaming a file onto itself (e.g. nopart mode) is a no-op.
645 if old_filename == new_filename:
647 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
648 except (IOError, OSError), err:
649 self.trouble(u'ERROR: unable to rename file')
651 def try_utime(self, filename, last_modified_hdr):
652 """Try to set the last-modified time of the given file."""
653 if last_modified_hdr is None:
655 if not os.path.isfile(_encodeFilename(filename)):
657 timestr = last_modified_hdr
660 filetime = timeconvert(timestr)
# Keep access time "now", set modification time from the HTTP header.
664 os.utime(filename, (time.time(), filetime))
# User-facing progress/status messages.
669 def report_writedescription(self, descfn):
670 """ Report that the description file is being written """
671 self.to_screen(u'[info] Writing video description to: ' + descfn)
673 def report_writeinfojson(self, infofn):
674 """ Report that the metadata file has been written """
675 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
677 def report_destination(self, filename):
678 """Report destination filename."""
679 self.to_screen(u'[download] Destination: ' + filename)
681 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
682 """Report download progress."""
683 if self.params.get('noprogress', False):
# \r rewrites the same console line on each update.
685 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
686 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
687 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
688 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
690 def report_resuming_byte(self, resume_len):
691 """Report attempt to resume at given byte."""
692 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
694 def report_retry(self, count, retries):
695 """Report retry in case of HTTP error 5xx"""
696 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
698 def report_file_already_downloaded(self, file_name):
699 """Report file has already been fully downloaded."""
# The filename may not be encodable for the console; fall back to a generic
# message instead of crashing.  NOTE(review): the try: line is missing from
# this view.
701 self.to_screen(u'[download] %s has already been downloaded' % file_name)
702 except (UnicodeEncodeError), err:
703 self.to_screen(u'[download] The file has already been downloaded')
705 def report_unable_to_resume(self):
706 """Report it was impossible to resume download."""
707 self.to_screen(u'[download] Unable to resume')
709 def report_finish(self):
710 """Report download finished."""
# NOTE(review): the else-branch following this if is missing from this view.
711 if self.params.get('noprogress', False):
712 self.to_screen(u'[download] Download completed')
716 def increment_downloads(self):
717 """Increment the ordinal that assigns a number to each file."""
718 self._num_downloads += 1
720 def prepare_filename(self, info_dict):
721 """Generate the output filename."""
723 template_dict = dict(info_dict)
# Extra template fields: epoch timestamp and zero-padded download ordinal.
724 template_dict['epoch'] = unicode(long(time.time()))
725 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
726 filename = self.params['outtmpl'] % template_dict
728 except (ValueError, KeyError), err:
729 self.trouble(u'ERROR: invalid system charset or erroneous output template')
732 def _match_entry(self, info_dict):
733 """ Returns None iff the file should be downloaded """
735 title = info_dict['title']
736 matchtitle = self.params.get('matchtitle', False)
737 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
# NOTE(review): this message already carries a '[download] ' prefix while
# the rejecttitle one below does not — process_info() prepends the prefix
# again, so matchtitle skips print it twice.  Confirm against the full
# source before changing either string.
738 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
739 rejecttitle = self.params.get('rejecttitle', False)
740 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
741 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
# NOTE(review): gapped excerpt — try:/return/else lines and several
# statements are missing from this view; code kept byte-identical.
744 def process_info(self, info_dict):
745 """Process a single dictionary returned by an InfoExtractor."""
# Title filters (--match-title / --reject-title).
747 reason = self._match_entry(info_dict)
748 if reason is not None:
749 self.to_screen(u'[download] ' + reason)
# Enforce --max-downloads before doing any work.
752 max_downloads = self.params.get('max_downloads')
753 if max_downloads is not None:
754 if self._num_downloads > int(max_downloads):
755 raise MaxDownloadsReached()
757 filename = self.prepare_filename(info_dict)
# Forced printing (--get-title, --get-url, ...).
760 if self.params.get('forcetitle', False):
761 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
762 if self.params.get('forceurl', False):
763 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
764 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
765 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
766 if self.params.get('forcedescription', False) and 'description' in info_dict:
767 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
768 if self.params.get('forcefilename', False) and filename is not None:
769 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
770 if self.params.get('forceformat', False):
771 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
773 # Do nothing else if in simulate mode
774 if self.params.get('simulate', False):
# Ensure the target directory exists before opening any file.
781 dn = os.path.dirname(_encodeFilename(filename))
782 if dn != '' and not os.path.exists(dn): # dn is already encoded
784 except (OSError, IOError), err:
785 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
# Optional side files: .description and .info.json.
788 if self.params.get('writedescription', False):
790 descfn = filename + u'.description'
791 self.report_writedescription(descfn)
792 descfile = open(_encodeFilename(descfn), 'wb')
794 descfile.write(info_dict['description'].encode('utf-8'))
797 except (OSError, IOError):
798 self.trouble(u'ERROR: Cannot write description file ' + descfn)
801 if self.params.get('writeinfojson', False):
802 infofn = filename + u'.info.json'
803 self.report_writeinfojson(infofn)
# `json` may be the trivialjson fallback, which has no dump() — hence the
# NameError/AttributeError check.
806 except (NameError,AttributeError):
807 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
810 infof = open(_encodeFilename(infofn), 'wb')
# 'urlhandle' holds a live urllib2 response and is not JSON-serialisable.
812 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
813 json.dump(json_info_dict, infof)
816 except (OSError, IOError):
817 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
820 if not self.params.get('skip_download', False):
821 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
825 success = self._do_download(filename, info_dict)
826 except (OSError, IOError), err:
827 raise UnavailableVideoError
828 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
829 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
831 except (ContentTooShortError, ), err:
832 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Run the post-processing chain on the downloaded file.
837 self.post_process(filename, info_dict)
838 except (PostProcessingError), err:
839 self.trouble(u'ERROR: postprocessing: %s' % str(err))
842 def download(self, url_list):
843 """Download a given list of URLs."""
# Multiple URLs with a non-templated output name would overwrite each other.
844 if len(url_list) > 1 and self.fixed_template():
845 raise SameFileError(self.params['outtmpl'])
# First registered InfoExtractor that claims the URL handles it.
# NOTE(review): the surrounding for-loops over url_list and self._ies are
# missing from this view; code kept byte-identical.
848 suitable_found = False
850 # Go to next InfoExtractor if not suitable
851 if not ie.suitable(url):
854 # Suitable InfoExtractor found
855 suitable_found = True
857 # Extract information from URL and process it
860 # Suitable InfoExtractor had been found; go to next URL
863 if not suitable_found:
864 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
866 return self._download_retcode
868 def post_process(self, filename, ie_info):
869 """Run the postprocessing chain on the given file."""
# NOTE(review): the copy of ie_info into `info` and the loop over self._pps
# are missing from this view.
871 info['filepath'] = filename
# Download an rtmp:// stream by shelling out to rtmpdump.  NOTE(review):
# gapped excerpt — the try:, break/return lines and retval handling tail are
# missing from this view; code kept byte-identical.
877 def _download_with_rtmpdump(self, filename, url, player_url):
878 self.report_destination(filename)
879 tmpfilename = self.temp_name(filename)
881 # Check for rtmpdump first
883 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
884 except (OSError, IOError):
885 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
888 # Download using rtmpdump. rtmpdump returns exit code 2 when
889 # the connection was interrumpted and resuming appears to be
890 # possible. This is part of rtmpdump's normal usage, AFAIK.
891 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
892 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
# Keep resuming while rtmpdump reports a resumable interruption.
893 while retval == 2 or retval == 1:
894 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
895 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
896 time.sleep(5.0) # This seems to be needed
897 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
898 cursize = os.path.getsize(_encodeFilename(tmpfilename))
# No progress and exit code 1: stop retrying.
899 if prevsize == cursize and retval == 1:
901 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
902 if prevsize == cursize and retval == 2 and cursize > 1024:
903 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
907 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
908 self.try_rename(tmpfilename, filename)
911 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Plain-HTTP download with resume and retry support.  NOTE(review): heavily
# gapped excerpt — try:/break/return/else lines and several assignments
# (count, start, block_size, open_mode, after) are missing from this view;
# code kept byte-identical.
914 def _do_download(self, filename, info_dict):
915 url = info_dict['url']
916 player_url = info_dict.get('player_url', None)
918 # Check file already present
919 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
920 self.report_file_already_downloaded(filename)
923 # Attempt to download using rtmpdump
924 if url.startswith('rtmp'):
925 return self._download_with_rtmpdump(filename, url, player_url)
927 tmpfilename = self.temp_name(filename)
930 # Do not include the Accept-Encoding header
931 headers = {'Youtubedl-no-compression': 'True'}
# basic_request stays without the Range header, for the 416 fallback below.
932 basic_request = urllib2.Request(url, None, headers)
933 request = urllib2.Request(url, None, headers)
935 # Establish possible resume length
936 if os.path.isfile(_encodeFilename(tmpfilename)):
937 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
943 if self.params.get('continuedl', False):
944 self.report_resuming_byte(resume_len)
945 request.add_header('Range','bytes=%d-' % resume_len)
951 retries = self.params.get('retries', 0)
952 while count <= retries:
953 # Establish connection
955 if count == 0 and 'urlhandle' in info_dict:
956 data = info_dict['urlhandle']
# NOTE(review): as shown here the assignment above is immediately
# overwritten by the unconditional urlopen below — confirm against the full
# source whether an `else:`/`break` is missing from this view.
957 data = urllib2.urlopen(request)
959 except (urllib2.HTTPError, ), err:
# Only retry on 5xx; 416 means the resume offset is at/past end of file.
960 if (err.code < 500 or err.code >= 600) and err.code != 416:
961 # Unexpected HTTP error
963 elif err.code == 416:
964 # Unable to resume (requested range not satisfiable)
966 # Open the connection again without the range header
967 data = urllib2.urlopen(basic_request)
968 content_length = data.info()['Content-Length']
969 except (urllib2.HTTPError, ), err:
970 if err.code < 500 or err.code >= 600:
973 # Examine the reported length
974 if (content_length is not None and
975 (resume_len - 100 < long(content_length) < resume_len + 100)):
976 # The file had already been fully downloaded.
977 # Explanation to the above condition: in issue #175 it was revealed that
978 # YouTube sometimes adds or removes a few bytes from the end of the file,
979 # changing the file size slightly and causing problems for some users. So
980 # I decided to implement a suggested change and consider the file
981 # completely downloaded if the file size differs less than 100 bytes from
982 # the one in the hard drive.
983 self.report_file_already_downloaded(filename)
984 self.try_rename(tmpfilename, filename)
987 # The length does not match, we start the download over
988 self.report_unable_to_resume()
994 self.report_retry(count, retries)
997 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Total size = remaining bytes reported by the server + bytes already on disk.
1000 data_len = data.info().get('Content-length', None)
1001 if data_len is not None:
1002 data_len = long(data_len) + resume_len
1003 data_len_str = self.format_bytes(data_len)
1004 byte_counter = 0 + resume_len
1008 # Download and write
1009 before = time.time()
1010 data_block = data.read(block_size)
1012 if len(data_block) == 0:
1014 byte_counter += len(data_block)
1016 # Open file just in time
1019 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1020 assert stream is not None
1021 filename = self.undo_temp_name(tmpfilename)
1022 self.report_destination(filename)
1023 except (OSError, IOError), err:
1024 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1027 stream.write(data_block)
1028 except (IOError, OSError), err:
1029 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adapt the next read size to the measured throughput.
1031 block_size = self.best_block_size(after - before, len(data_block))
# Progress: speed/ETA are computed against this session's bytes only.
1034 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1035 if data_len is None:
1036 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1038 percent_str = self.calc_percent(byte_counter, data_len)
1039 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1040 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1043 self.slow_down(start, byte_counter - resume_len)
1046 self.trouble(u'\nERROR: Did not get any data blocks')
1049 self.report_finish()
# A short read relative to the announced length is treated as an error.
1050 if data_len is not None and byte_counter != data_len:
1051 raise ContentTooShortError(byte_counter, long(data_len))
1052 self.try_rename(tmpfilename, filename)
1054 # Update file modification time
1055 if self.params.get('updatetime', True):
1056 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
# Abstract base class for all site-specific extractors.  NOTE(review):
# gapped excerpt — docstring closers, `pass` bodies and the initialization
# guard lines are missing from this view; code kept byte-identical.
1061 class InfoExtractor(object):
1062 """Information Extractor class.
1064 Information extractors are the classes that, given a URL, extract
1065 information from the video (or videos) the URL refers to. This
1066 information includes the real video URL, the video title and simplified
1067 title, author and others. The information is stored in a dictionary
1068 which is then passed to the FileDownloader. The FileDownloader
1069 processes this information possibly downloading the video to the file
1070 system, among other possible outcomes. The dictionaries must include
1071 the following fields:
1073 id: Video identifier.
1074 url: Final video URL.
1075 uploader: Nickname of the video uploader.
1076 title: Literal title.
1077 stitle: Simplified title.
1078 ext: Video filename extension.
1079 format: Video format.
1080 player_url: SWF Player URL (may be None).
1082 The following fields are optional. Their primary purpose is to allow
1083 youtube-dl to serve as the backend for a video search function, such
1084 as the one in youtube2mp3. They are only used when their respective
1085 forced printing functions are called:
1087 thumbnail: Full URL to a video thumbnail image.
1088 description: One-line video description.
1090 Subclasses of this one should re-define the _real_initialize() and
1091 _real_extract() methods and define a _VALID_URL regexp.
1092 Probably, they should also be added to the list of extractors.
1098 def __init__(self, downloader=None):
1099 """Constructor. Receives an optional downloader."""
1101 self.set_downloader(downloader)
1103 def suitable(self, url):
1104 """Receives a URL and returns True if suitable for this IE."""
1105 return re.match(self._VALID_URL, url) is not None
1107 def initialize(self):
1108 """Initializes an instance (authentication, etc)."""
1110 self._real_initialize()
1113 def extract(self, url):
1114 """Extracts URL information and returns it in list of dicts."""
1116 return self._real_extract(url)
1118 def set_downloader(self, downloader):
1119 """Sets the downloader for this IE."""
1120 self._downloader = downloader
# Template methods: subclasses override these two.
1122 def _real_initialize(self):
1123 """Real initialization process. Redefine in subclasses."""
1126 def _real_extract(self, url):
1127 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com.

	NOTE(review): this region of the dump is missing many interleaved lines
	(try/except openers, ``return`` statements, ``if mobj is None`` guards,
	dict entries and closing braces).  Only the surviving lines are kept
	below; indentation is best-effort and truncation points are marked.
	"""

	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the site to English so the scraped strings below are predictable
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name used to look up credentials in the user's ~/.netrc
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> file extension (most entries and the closing brace truncated in dump)
	_video_extensions = {
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	# itag -> display-size string (entries and closing brace truncated in dump)
	_video_dimensions = {
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for this video."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _print_formats(self, formats):
		"""Print every itag with its extension and dimensions to stdout."""
		print 'Available formats:'
		# NOTE(review): the ``for x in formats:`` loop header is missing from this dump
		print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set interface language, then log in and confirm age when possible."""
		if self._downloader is None:
			# (early-exit body truncated in dump)
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# (try: opener truncated) look up stored credentials for _NETRC_MACHINE
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
				# (success branch truncated; the raise belonged to the no-info case)
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				# .netrc problems are non-fatal: warn and continue unauthenticated
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Set language -- best effort, failure only warns
		request = urllib2.Request(self._LANG_URL)
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed
		if username is None:
			# (early-return body truncated in dump)

		# (login_form dict opener truncated) fields POSTed to the signup page
			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the login failed
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

		# (age_form dict opener truncated) confirm age; failure here is fatal
			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Download the watch page and video info, pick formats, and hand each
		selected (format, url) pair to the downloader's process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
			# (``if mobj is None:`` guard truncated)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Undo the JSON backslash-escaping in the matched URL
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		# (else-branch presumably setting player_url to a default is truncated)

		# Get video info: try several ``el`` variants until one yields a token
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					# (loop-exit statement truncated) first variant with a token wins
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				# (``else:`` truncated; the line below belonged to the no-reason case)
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader nickname
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title (filesystem-safe)
		simple_title = _simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else: # don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: normalize separators, then try the known layouts
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
					# (try/except around strptime truncated in dump)
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		# description (only fetched when the user asked for it)
		video_description = u'No description available.'
		if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
			# (opener of the lxml-based alternative branch truncated in dump)
				html_parser = lxml.etree.HTMLParser(encoding='utf-8')
				vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
				video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
				# TODO use another parser

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				format_list = available_formats[available_formats.index(format_limit):]
			# (``else:`` truncated)
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			# (``else:`` truncated)
				# Specific formats. We pick the first in a slash-delimited sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
						# (``if rf in url_map:`` guard and loop exit truncated)
						video_url_list = [(rf, url_map[rf])]
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
		# (``else:`` truncated)
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension defaults to flv when the itag is unknown
			video_extension = self._video_extensions.get(format_param, 'flv')

				# (``try:`` truncated) Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'upload_date': upload_date,
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail': video_thumbnail.decode('utf-8'),
					'description': video_description,
					'player_url': player_url,
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	NOTE(review): extraction dump; several guards, try/except openers, dict
	openers and ``return`` statements are missing below.  Truncation points
	are marked; indentation is best-effort.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for metacafe pages that merely wrap a YouTube video
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# Confirm age (form dict opener truncated in dump)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			# (``if mobj is None:`` guard truncated)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Wrapped video: delegate to the YouTube extractor
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Extension is taken from the last three characters of the media URL
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
				# (no-key branch truncated)
				video_url = mediaURL
				# (``else:`` truncated) key present: append it to the media URL
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
			# (opener of the flashvars fallback branch truncated)
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')

			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			# Un-escape the JSON-escaped slashes
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			# (``try:`` truncated) Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion.

	NOTE(review): extraction dump; ``if mobj is None:`` guards, try/except
	openers and ``return`` statements are missing below.  Truncation points
	are marked; indentation is best-effort.
	"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
			# (``if mobj is None:`` guard truncated)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# Disable the family filter so age-gated pages remain reachable
		request.add_header('Cookie', 'family_filter=off')
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		# Strip the backslash escaping from the embedded URL
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

			# (``try:`` truncated) Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	NOTE(review): extraction dump; guards, try/except openers and ``return``
	statements are missing below.  Truncation points are marked.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
			# (``if mobj is None:`` guard truncated)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
			# (branch opener truncated) no direct download URL: fall back to flash
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
				self._downloader.trouble(u'ERROR: unable to extract media URL')
			mediaURL = urllib.unquote(mobj.group(1))
			# Un-escape the \xNN sequences embedded in the page source
			mediaURL = mediaURL.replace('\\x3d', '\x3d')
			mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires a second page fetch)
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else: # we need something to pass to process_info
			video_thumbnail = ''

			# (``try:`` truncated) Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1785 class PhotobucketIE(InfoExtractor):
1786 """Information extractor for photobucket.com."""
1788 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1789 IE_NAME = u'photobucket'
1791 def __init__(self, downloader=None):
1792 InfoExtractor.__init__(self, downloader)
1794 def report_download_webpage(self, video_id):
1795 """Report webpage download."""
1796 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1798 def report_extraction(self, video_id):
1799 """Report information extraction."""
1800 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1802 def _real_extract(self, url):
1803 # Extract id from URL
1804 mobj = re.match(self._VALID_URL, url)
1806 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1809 # At this point we have a new video
1810 self._downloader.increment_downloads()
1811 video_id = mobj.group(1)
1813 video_extension = 'flv'
1815 # Retrieve video webpage to extract further information
1816 request = urllib2.Request(url)
1818 self.report_download_webpage(video_id)
1819 webpage = urllib2.urlopen(request).read()
1820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1821 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1824 # Extract URL, uploader, and title from webpage
1825 self.report_extraction(video_id)
1826 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1828 self._downloader.trouble(u'ERROR: unable to extract media URL')
1830 mediaURL = urllib.unquote(mobj.group(1))
1832 video_url = mediaURL
1834 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1836 self._downloader.trouble(u'ERROR: unable to extract title')
1838 video_title = mobj.group(1).decode('utf-8')
1839 video_title = sanitize_title(video_title)
1840 simple_title = _simplify_title(vide_title)
1842 video_uploader = mobj.group(2).decode('utf-8')
1845 # Process video information
1846 self._downloader.process_info({
1847 'id': video_id.decode('utf-8'),
1848 'url': video_url.decode('utf-8'),
1849 'uploader': video_uploader,
1850 'upload_date': u'NA',
1851 'title': video_title,
1852 'stitle': simple_title,
1853 'ext': video_extension.decode('utf-8'),
1857 except UnavailableVideoError:
1858 self._downloader.trouble(u'\nERROR: unable to download video')
1861 class YahooIE(InfoExtractor):
1862 """Information extractor for video.yahoo.com."""
1864 # _VALID_URL matches all Yahoo! Video URLs
1865 # _VPAGE_URL matches only the extractable '/watch/' URLs
1866 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1867 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1868 IE_NAME = u'video.yahoo'
1870 def __init__(self, downloader=None):
1871 InfoExtractor.__init__(self, downloader)
1873 def report_download_webpage(self, video_id):
1874 """Report webpage download."""
1875 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1877 def report_extraction(self, video_id):
1878 """Report information extraction."""
1879 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1881 def _real_extract(self, url, new_video=True):
1882 # Extract ID from URL
1883 mobj = re.match(self._VALID_URL, url)
1885 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1888 # At this point we have a new video
1889 self._downloader.increment_downloads()
1890 video_id = mobj.group(2)
1891 video_extension = 'flv'
1893 # Rewrite valid but non-extractable URLs as
1894 # extractable English language /watch/ URLs
1895 if re.match(self._VPAGE_URL, url) is None:
1896 request = urllib2.Request(url)
1898 webpage = urllib2.urlopen(request).read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1903 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1905 self._downloader.trouble(u'ERROR: Unable to extract id field')
1907 yahoo_id = mobj.group(1)
1909 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1911 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1913 yahoo_vid = mobj.group(1)
1915 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1916 return self._real_extract(url, new_video=False)
1918 # Retrieve video webpage to extract further information
1919 request = urllib2.Request(url)
1921 self.report_download_webpage(video_id)
1922 webpage = urllib2.urlopen(request).read()
1923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1924 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1927 # Extract uploader and title from webpage
1928 self.report_extraction(video_id)
1929 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1931 self._downloader.trouble(u'ERROR: unable to extract video title')
1933 video_title = mobj.group(1).decode('utf-8')
1934 simple_title = _simplify_title(video_title)
1936 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1938 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1940 video_uploader = mobj.group(1).decode('utf-8')
1942 # Extract video thumbnail
1943 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1945 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1947 video_thumbnail = mobj.group(1).decode('utf-8')
1949 # Extract video description
1950 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1952 self._downloader.trouble(u'ERROR: unable to extract video description')
1954 video_description = mobj.group(1).decode('utf-8')
1955 if not video_description:
1956 video_description = 'No description available.'
1958 # Extract video height and width
1959 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1961 self._downloader.trouble(u'ERROR: unable to extract video height')
1963 yv_video_height = mobj.group(1)
1965 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1967 self._downloader.trouble(u'ERROR: unable to extract video width')
1969 yv_video_width = mobj.group(1)
1971 # Retrieve video playlist to extract media URL
1972 # I'm not completely sure what all these options are, but we
1973 # seem to need most of them, otherwise the server sends a 401.
1974 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1975 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1976 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1977 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1978 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1980 self.report_download_webpage(video_id)
1981 webpage = urllib2.urlopen(request).read()
1982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1986 # Extract media URL from playlist XML
1987 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1989 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1991 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1992 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1995 # Process video information
1996 self._downloader.process_info({
1997 'id': video_id.decode('utf-8'),
1999 'uploader': video_uploader,
2000 'upload_date': u'NA',
2001 'title': video_title,
2002 'stitle': simple_title,
2003 'ext': video_extension.decode('utf-8'),
2004 'thumbnail': video_thumbnail.decode('utf-8'),
2005 'description': video_description,
2006 'thumbnail': video_thumbnail,
2009 except UnavailableVideoError:
2010 self._downloader.trouble(u'\nERROR: unable to download video')
# Vimeo extractor: resolves a vimeo.com URL to the clip's moogaloop XML
# config, scrapes title/uploader/thumbnail/signature fields from it, and
# hands the assembled metadata to the downloader.
# NOTE(review): this listing is an elided excerpt -- guard lines such as
# "if mobj is None:" / "return" and "try:" openers are not shown between
# the numbered lines.
2013 class VimeoIE(InfoExtractor):
2014 """Information extractor for vimeo.com."""
2016 # _VALID_URL matches Vimeo URLs
2017 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?(?:moogaloop.swf\?clip_id=)?([0-9]+)'
2020 def __init__(self, downloader=None):
2021 InfoExtractor.__init__(self, downloader)
2023 def report_download_webpage(self, video_id):
2024 """Report webpage download."""
2025 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2027 def report_extraction(self, video_id):
2028 """Report information extraction."""
2029 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction entry point; new_video is accepted but not read anywhere
# in the visible lines.
2031 def _real_extract(self, url, new_video=True):
2032 # Extract ID from URL
2033 mobj = re.match(self._VALID_URL, url)
2035 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2038 # At this point we have a new video
2039 self._downloader.increment_downloads()
2040 video_id = mobj.group(1)
2042 # Retrieve video webpage to extract further information
# The moogaloop "load" endpoint returns an XML document describing the clip.
2043 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2045 self.report_download_webpage(video_id)
2046 webpage = urllib2.urlopen(request).read()
2047 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2051 # Now we begin extracting as much information as we can from what we
2052 # retrieved. First we extract the information common to all extractors,
2053 # and latter we extract those that are Vimeo specific.
2054 self.report_extraction(video_id)
2057 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2059 self._downloader.trouble(u'ERROR: unable to extract video title')
2061 video_title = mobj.group(1).decode('utf-8')
2062 simple_title = _simplify_title(video_title)
2065 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2067 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2069 video_uploader = mobj.group(1).decode('utf-8')
2071 # Extract video thumbnail
2072 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2074 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2076 video_thumbnail = mobj.group(1).decode('utf-8')
2078 # # Extract video description
2079 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2081 # self._downloader.trouble(u'ERROR: unable to extract video description')
2083 # video_description = mobj.group(1).decode('utf-8')
2084 # if not video_description: video_description = 'No description available.'
# NOTE(review): hard-coded placeholder description -- the real
# og:description scrape above is commented out.
2085 video_description = 'Foo.'
2087 # Vimeo specific: extract request signature
2088 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2090 self._downloader.trouble(u'ERROR: unable to extract request signature')
2092 sig = mobj.group(1).decode('utf-8')
2094 # Vimeo specific: extract video quality information
2095 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2097 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2099 quality = mobj.group(1).decode('utf-8')
# Branch bodies for the HD/SD choice are elided in this excerpt --
# presumably they select a quality token; confirm against full source.
2101 if int(quality) == 1:
2106 # Vimeo specific: Extract request signature expiration
2107 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2109 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2111 sig_exp = mobj.group(1).decode('utf-8')
# Final media URL: moogaloop "play" endpoint keyed by id, signature,
# signature expiry and quality.
2113 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice in this
# dict literal; the later 'thumbnail' entry (the undecoded value) silently
# overrides the decoded one above.
2125 'thumbnail': video_thumbnail.decode('utf-8'),
2126 'description': video_description,
2127 'thumbnail': video_thumbnail,
2128 'description': video_description,
2131 except UnavailableVideoError:
2132 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: scrapes any page for a JW-Player/SWFObject style
# "file=" media URL and derives id/title/uploader heuristically.
# (Elided excerpt: guard/return and "try:" lines between the numbered
# lines are not shown.)
2135 class GenericIE(InfoExtractor):
2136 """Generic last-resort information extractor."""
2139 IE_NAME = u'generic'
2141 def __init__(self, downloader=None):
2142 InfoExtractor.__init__(self, downloader)
2144 def report_download_webpage(self, video_id):
2145 """Report webpage download."""
# Warn loudly: reaching this extractor means no specific IE matched.
2146 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2147 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2149 def report_extraction(self, video_id):
2150 """Report information extraction."""
2151 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2153 def _real_extract(self, url):
2154 # At this point we have a new video
2155 self._downloader.increment_downloads()
# Provisional id: last path component of the page URL (replaced below once
# the real media URL is known).
2157 video_id = url.split('/')[-1]
2158 request = urllib2.Request(url)
2160 self.report_download_webpage(video_id)
2161 webpage = urllib2.urlopen(request).read()
2162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2163 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2165 except ValueError, err:
2166 # since this is the last-resort InfoExtractor, if
2167 # this error is thrown, it'll be thrown here
2168 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2171 self.report_extraction(video_id)
2172 # Start with something easy: JW Player in SWFObject
2173 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2175 # Broaden the search a little bit
2176 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2178 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2181 # It's possible that one of the regexes
2182 # matched, but returned an empty group:
2183 if mobj.group(1) is None:
2184 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2187 video_url = urllib.unquote(mobj.group(1))
2188 video_id = os.path.basename(video_url)
2190 # here's a fun little line of code for you:
2191 video_extension = os.path.splitext(video_id)[1][1:]
2192 video_id = os.path.splitext(video_id)[0]
2194 # it's tempting to parse this further, but you would
2195 # have to take into account all the variations like
2196 # Video Title - Site Name
2197 # Site Name | Video Title
2198 # Video Title - Tagline | Site Name
2199 # and so on and so forth; it's just not practical
2200 mobj = re.search(r'<title>(.*)</title>', webpage)
2202 self._downloader.trouble(u'ERROR: unable to extract title')
2204 video_title = mobj.group(1).decode('utf-8')
2205 video_title = sanitize_title(video_title)
2206 simple_title = _simplify_title(video_title)
2208 # video uploader is domain name
2209 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): copy/paste bug -- this branch handles a failed *uploader*
# (domain) match but reports "unable to extract title".
2211 self._downloader.trouble(u'ERROR: unable to extract title')
2213 video_uploader = mobj.group(1).decode('utf-8')
2216 # Process video information
2217 self._downloader.process_info({
2218 'id': video_id.decode('utf-8'),
2219 'url': video_url.decode('utf-8'),
2220 'uploader': video_uploader,
2221 'upload_date': u'NA',
2222 'title': video_title,
2223 'stitle': simple_title,
2224 'ext': video_extension.decode('utf-8'),
2228 except UnavailableVideoError, err:
2229 self._downloader.trouble(u'\nERROR: unable to download video')
# Search IE: parses "ytsearchN:query" / "ytsearchall:query" strings, pages
# through the HTML results, and delegates each found id to the YouTube IE.
# (Elided excerpt: guard/return, "try:" and loop-setup lines between the
# numbered lines are not shown.)
2232 class YoutubeSearchIE(InfoExtractor):
2233 """Information Extractor for YouTube search queries."""
2234 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2235 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2236 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2237 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2239 _max_youtube_results = 1000
2240 IE_NAME = u'youtube:search'
2242 def __init__(self, youtube_ie, downloader=None):
2243 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this wrapped YouTube IE.
2244 self._youtube_ie = youtube_ie
2246 def report_download_page(self, query, pagenum):
2247 """Report attempt to download playlist page with given number."""
2248 query = query.decode(preferredencoding())
2249 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2251 def _real_initialize(self):
2252 self._youtube_ie.initialize()
# Parses the "ytsearch[N|all]:" prefix and dispatches to
# _download_n_results with the requested result count.
2254 def _real_extract(self, query):
2255 mobj = re.match(self._VALID_URL, query)
2257 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2260 prefix, query = query.split(':')
2262 query = query.encode('utf-8')
2264 self._download_n_results(query, 1)
2266 elif prefix == 'all':
2267 self._download_n_results(query, self._max_youtube_results)
2273 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2275 elif n > self._max_youtube_results:
2276 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2277 n = self._max_youtube_results
2278 self._download_n_results(query, n)
2280 except ValueError: # parsing prefix as integer fails
2281 self._download_n_results(query, 1)
2284 def _download_n_results(self, query, n):
2285 """Downloads a specified number of results for a query"""
2288 already_seen = set()
2292 self.report_download_page(query, pagenum)
2293 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2294 request = urllib2.Request(result_url)
2296 page = urllib2.urlopen(request).read()
2297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2298 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2301 # Extract video identifiers
2302 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): brittle id parsing -- slices the raw href text on '=' and
# strips the trailing quote; breaks if the markup gains extra '=' signs.
2303 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2304 if video_id not in already_seen:
2305 video_ids.append(video_id)
2306 already_seen.add(video_id)
2307 if len(video_ids) == n:
2308 # Specified n videos reached
# Note: loop variable 'id' shadows the builtin of the same name.
2309 for id in video_ids:
2310 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: flush whatever was collected and stop paging.
2313 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2314 for id in video_ids:
2315 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2318 pagenum = pagenum + 1
# Google Video search IE: same structure as YoutubeSearchIE above, with a
# "gvsearch" prefix; ids come straight from the _VIDEO_INDICATOR capture
# group rather than from string slicing. (Elided excerpt.)
2321 class GoogleSearchIE(InfoExtractor):
2322 """Information Extractor for Google Video search queries."""
2323 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2324 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2325 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2326 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2328 _max_google_results = 1000
2329 IE_NAME = u'video.google:search'
2331 def __init__(self, google_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to the wrapped Google Video IE.
2333 self._google_ie = google_ie
2335 def report_download_page(self, query, pagenum):
2336 """Report attempt to download playlist page with given number."""
2337 query = query.decode(preferredencoding())
2338 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2340 def _real_initialize(self):
2341 self._google_ie.initialize()
# Parses the "gvsearch[N|all]:" prefix and dispatches with the requested
# result count.
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2349 prefix, query = query.split(':')
2351 query = query.encode('utf-8')
2353 self._download_n_results(query, 1)
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_google_results)
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2364 elif n > self._max_google_results:
2365 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2366 n = self._max_google_results
2367 self._download_n_results(query, n)
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2377 already_seen = set()
2381 self.report_download_page(query, pagenum)
2382 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2383 request = urllib2.Request(result_url)
2385 page = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2390 # Extract video identifiers
2391 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2392 video_id = mobj.group(1)
2393 if video_id not in already_seen:
2394 video_ids.append(video_id)
2395 already_seen.add(video_id)
2396 if len(video_ids) == n:
2397 # Specified n videos reached
# Note: loop variable 'id' shadows the builtin of the same name.
2398 for id in video_ids:
2399 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: flush collected ids and stop paging.
2402 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2403 for id in video_ids:
2404 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2407 pagenum = pagenum + 1
# Yahoo! Video search IE: third copy of the search-IE pattern, keyed by the
# "yvsearch" prefix. (Elided excerpt -- guard/return, "try:" and loop-setup
# lines are not shown between the numbered lines.)
2410 class YahooSearchIE(InfoExtractor):
2411 """Information Extractor for Yahoo! Video search queries."""
2412 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2413 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2414 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2415 _MORE_PAGES_INDICATOR = r'\s*Next'
2417 _max_yahoo_results = 1000
2418 IE_NAME = u'video.yahoo:search'
2420 def __init__(self, yahoo_ie, downloader=None):
2421 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to the wrapped Yahoo IE.
2422 self._yahoo_ie = yahoo_ie
2424 def report_download_page(self, query, pagenum):
2425 """Report attempt to download playlist page with given number."""
2426 query = query.decode(preferredencoding())
2427 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2429 def _real_initialize(self):
2430 self._yahoo_ie.initialize()
# Parses the "yvsearch[N|all]:" prefix and dispatches with the requested
# result count.
2432 def _real_extract(self, query):
2433 mobj = re.match(self._VALID_URL, query)
2435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2438 prefix, query = query.split(':')
2440 query = query.encode('utf-8')
2442 self._download_n_results(query, 1)
2444 elif prefix == 'all':
2445 self._download_n_results(query, self._max_yahoo_results)
2451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2453 elif n > self._max_yahoo_results:
2454 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2455 n = self._max_yahoo_results
2456 self._download_n_results(query, n)
2458 except ValueError: # parsing prefix as integer fails
2459 self._download_n_results(query, 1)
2462 def _download_n_results(self, query, n):
2463 """Downloads a specified number of results for a query"""
2466 already_seen = set()
2470 self.report_download_page(query, pagenum)
2471 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2472 request = urllib2.Request(result_url)
2474 page = urllib2.urlopen(request).read()
2475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2476 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2479 # Extract video identifiers
2480 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The watch id is a compound "num/num" path fragment per _VIDEO_INDICATOR.
2481 video_id = mobj.group(1)
2482 if video_id not in already_seen:
2483 video_ids.append(video_id)
2484 already_seen.add(video_id)
2485 if len(video_ids) == n:
2486 # Specified n videos reached
# Note: loop variable 'id' shadows the builtin of the same name.
2487 for id in video_ids:
2488 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: flush collected ids and stop paging.
2491 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2492 for id in video_ids:
2493 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2496 pagenum = pagenum + 1
# Playlist IE: resolves playlist/artist/course URLs, pages through the
# playlist HTML collecting watch ids, then delegates each to the wrapped
# YouTube IE. (Elided excerpt -- guard/return, "try:", "else:" and
# loop-setup lines are not shown between the numbered lines.)
2499 class YoutubePlaylistIE(InfoExtractor):
2500 """Information Extractor for YouTube playlists."""
2502 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2503 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2504 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2505 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2507 IE_NAME = u'youtube:playlist'
2509 def __init__(self, youtube_ie, downloader=None):
2510 InfoExtractor.__init__(self, downloader)
2511 self._youtube_ie = youtube_ie
2513 def report_download_page(self, playlist_id, pagenum):
2514 """Report attempt to download playlist page with given number."""
2515 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2517 def _real_initialize(self):
2518 self._youtube_ie.initialize()
2520 def _real_extract(self, url):
2521 # Extract playlist id
2522 mobj = re.match(self._VALID_URL, url)
2524 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 (a single video id embedded in the URL) takes precedence:
# extract just that video instead of the whole playlist.
2528 if mobj.group(3) is not None:
2529 self._youtube_ie.extract(mobj.group(3))
2532 # Download playlist pages
2533 # prefix is 'p' as default for playlists but there are other types that need extra care
2534 playlist_prefix = mobj.group(1)
2535 if playlist_prefix == 'a':
2536 playlist_access = 'artist'
2538 playlist_prefix = 'p'
2539 playlist_access = 'view_play_list'
2540 playlist_id = mobj.group(2)
2545 self.report_download_page(playlist_id, pagenum)
2546 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2547 request = urllib2.Request(url)
2549 page = urllib2.urlopen(request).read()
2550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2554 # Extract video identifiers
2556 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# De-duplicate within the page while preserving first-seen order.
2557 if mobj.group(1) not in ids_in_page:
2558 ids_in_page.append(mobj.group(1))
2559 video_ids.extend(ids_in_page)
2561 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2563 pagenum = pagenum + 1
2565 playliststart = self._downloader.params.get('playliststart', 1) - 1
2566 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice silently
# drops the final video; YoutubeUserIE handles the -1 sentinel explicitly
# before slicing.
2567 video_ids = video_ids[playliststart:playlistend]
2569 for id in video_ids:
2570 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# User IE: pages through a user's uploads via the GData feed (50 ids per
# request) and delegates each id to the wrapped YouTube IE.
# (Elided excerpt -- guard/return, "try:", "else:"/"break" and loop-setup
# lines are not shown between the numbered lines.)
2574 class YoutubeUserIE(InfoExtractor):
2575 """Information Extractor for YouTube users."""
2577 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2578 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2579 _GDATA_PAGE_SIZE = 50
2580 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2581 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2583 IE_NAME = u'youtube:user'
2585 def __init__(self, youtube_ie, downloader=None):
2586 InfoExtractor.__init__(self, downloader)
2587 self._youtube_ie = youtube_ie
2589 def report_download_page(self, username, start_index):
2590 """Report attempt to download user page."""
2591 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2592 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2594 def _real_initialize(self):
2595 self._youtube_ie.initialize()
2597 def _real_extract(self, url):
2599 mobj = re.match(self._VALID_URL, url)
2601 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2604 username = mobj.group(1)
2606 # Download video ids using YouTube Data API. Result size per
2607 # query is limited (currently to 50 videos) so we need to query
2608 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2615 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2616 self.report_download_page(username, start_index)
2618 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2621 page = urllib2.urlopen(request).read()
2622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2623 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2626 # Extract video identifiers
2629 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# De-duplicate within the page while preserving first-seen order.
2630 if mobj.group(1) not in ids_in_page:
2631 ids_in_page.append(mobj.group(1))
2633 video_ids.extend(ids_in_page)
2635 # A little optimization - if current page is not
2636 # "full", ie. does not contain PAGE_SIZE video ids then
2637 # we can assume that this page is the last one - there
2638 # are no more ids on further pages - no need to query
2641 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2646 all_ids_count = len(video_ids)
2647 playliststart = self._downloader.params.get('playliststart', 1) - 1
2648 playlistend = self._downloader.params.get('playlistend', -1)
# -1 sentinel means "no upper bound" -- handled explicitly so the slice
# keeps the last id (contrast with YoutubePlaylistIE, which does not).
2650 if playlistend == -1:
2651 video_ids = video_ids[playliststart:]
2653 video_ids = video_ids[playliststart:playlistend]
2655 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2656 (username, all_ids_count, len(video_ids)))
2658 for video_id in video_ids:
2659 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# DepositFiles IE: POSTs the "Free download" form to the English-locale
# file page, scrapes the real fileshare URL and title, and hands the file
# metadata to the downloader. (Elided excerpt -- guard/return and "try:"
# lines are not shown between the numbered lines.)
2662 class DepositFilesIE(InfoExtractor):
2663 """Information extractor for depositfiles.com"""
2665 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2666 IE_NAME = u'DepositFiles'
2668 def __init__(self, downloader=None):
2669 InfoExtractor.__init__(self, downloader)
2671 def report_download_webpage(self, file_id):
2672 """Report webpage download."""
2673 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2675 def report_extraction(self, file_id):
2676 """Report information extraction."""
2677 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2679 def _real_extract(self, url):
2680 # At this point we have a new file
2681 self._downloader.increment_downloads()
2683 file_id = url.split('/')[-1]
2684 # Rebuild url in english locale
2685 url = 'http://depositfiles.com/en/files/' + file_id
2687 # Retrieve file webpage with 'Free download' button pressed
# Posting gateway_result=1 simulates pressing the "Free download" button.
2688 free_download_indication = { 'gateway_result' : '1' }
2689 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2691 self.report_download_webpage(file_id)
2692 webpage = urllib2.urlopen(request).read()
2693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2697 # Search for the real file URL
2698 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2699 if (mobj is None) or (mobj.group(1) is None):
2700 # Try to figure out reason of the error.
2701 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2702 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the multi-line restriction text into one line for the error.
# Non-raw '\s+' works here (\s has no string-escape meaning), but r'\s+'
# would be the conventional spelling.
2703 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2704 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2706 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2709 file_url = mobj.group(1)
2710 file_extension = os.path.splitext(file_url)[1][1:]
2712 # Search for file title
2713 mobj = re.search(r'<b title="(.*?)">', webpage)
2715 self._downloader.trouble(u'ERROR: unable to extract title')
2717 file_title = mobj.group(1).decode('utf-8')
2720 # Process file information
2721 self._downloader.process_info({
2722 'id': file_id.decode('utf-8'),
2723 'url': file_url.decode('utf-8'),
2725 'upload_date': u'NA',
2726 'title': file_title,
2727 'stitle': file_title,
2728 'ext': file_extension.decode('utf-8'),
2732 except UnavailableVideoError, err:
2733 self._downloader.trouble(u'ERROR: unable to download file')
# Facebook IE: logs in (credentials or .netrc), downloads the video page,
# scrapes metadata and per-format URLs out of inline JavaScript, then runs
# the same format-selection logic as the YouTube IE.
# (Elided excerpt -- guard/return, "try:", "else:" and several assignment
# lines are not shown between the numbered lines.)
2736 class FacebookIE(InfoExtractor):
2737 """Information Extractor for Facebook"""
2739 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2740 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2741 _NETRC_MACHINE = 'facebook'
# Formats in best-to-worst order; format selection below relies on this.
2742 _available_formats = ['video', 'highqual', 'lowqual']
2743 _video_extensions = {
2748 IE_NAME = u'facebook'
2750 def __init__(self, downloader=None):
2751 InfoExtractor.__init__(self, downloader)
2753 def _reporter(self, message):
2754 """Add header and report message."""
2755 self._downloader.to_screen(u'[facebook] %s' % message)
2757 def report_login(self):
2758 """Report attempt to log in."""
2759 self._reporter(u'Logging in')
2761 def report_video_webpage_download(self, video_id):
2762 """Report attempt to download video webpage."""
2763 self._reporter(u'%s: Downloading video webpage' % video_id)
2765 def report_information_extraction(self, video_id):
2766 """Report attempt to extract video information."""
2767 self._reporter(u'%s: Extracting video information' % video_id)
2769 def _parse_page(self, video_webpage):
2770 """Extract video information from page"""
# Map of metadata field -> regex over the raw page; matches are
# unicode-escape-decoded because the values sit in JS string literals.
2772 data = {'title': r'\("video_title", "(.*?)"\)',
2773 'description': r'<div class="datawrap">(.*?)</div>',
2774 'owner': r'\("video_owner_name", "(.*?)"\)',
2775 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2778 for piece in data.keys():
2779 mobj = re.search(data[piece], video_webpage)
2780 if mobj is not None:
2781 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format, keyed by format name.
2785 for fmt in self._available_formats:
2786 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2787 if mobj is not None:
2788 # URL is in a Javascript segment inside an escaped Unicode format within
2789 # the generally utf-8 page
2790 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2791 video_info['video_urls'] = video_urls
# Performs the login handshake before any extraction. Failures are only
# warnings: extraction of public videos may still work unauthenticated.
2795 def _real_initialize(self):
2796 if self._downloader is None:
2801 downloader_params = self._downloader.params
2803 # Attempt to use provided username and password or .netrc data
2804 if downloader_params.get('username', None) is not None:
2805 useremail = downloader_params['username']
2806 password = downloader_params['password']
2807 elif downloader_params.get('usenetrc', False):
2809 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2810 if info is not None:
2814 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2815 except (IOError, netrc.NetrcParseError), err:
2816 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2819 if useremail is None:
2828 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2831 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means login did not stick.
# NOTE(review): "exceded" typo in this user-facing string (left untouched;
# runtime strings are out of scope for a documentation pass).
2832 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2833 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2836 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2839 def _real_extract(self, url):
2840 mobj = re.match(self._VALID_URL, url)
2842 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2844 video_id = mobj.group('ID')
2847 self.report_video_webpage_download(video_id)
2848 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2850 page = urllib2.urlopen(request)
2851 video_webpage = page.read()
2852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2853 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2856 # Start extracting information
2857 self.report_information_extraction(video_id)
2859 # Extract information
2860 video_info = self._parse_page(video_webpage)
2863 if 'owner' not in video_info:
2864 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2866 video_uploader = video_info['owner']
2869 if 'title' not in video_info:
2870 self._downloader.trouble(u'ERROR: unable to extract video title')
2872 video_title = video_info['title']
2873 video_title = video_title.decode('utf-8')
2874 video_title = sanitize_title(video_title)
2876 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning; fall back to an empty string.
2879 if 'thumbnail' not in video_info:
2880 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2881 video_thumbnail = ''
2883 video_thumbnail = video_info['thumbnail']
2887 if 'upload_date' in video_info:
2888 upload_time = video_info['upload_date']
# parsedate_tz returns a 10-tuple; [0:9] drops the tz offset so the
# result is shaped like the struct_time that strftime expects.
2889 timetuple = email.utils.parsedate_tz(upload_time)
2890 if timetuple is not None:
2892 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2897 video_description = video_info.get('description', 'No description available.')
2899 url_map = video_info['video_urls']
2900 if len(url_map.keys()) > 0:
2901 # Decide which formats to download
2902 req_format = self._downloader.params.get('format', None)
2903 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2905 if format_limit is not None and format_limit in self._available_formats:
2906 format_list = self._available_formats[self._available_formats.index(format_limit):]
2908 format_list = self._available_formats
2909 existing_formats = [x for x in format_list if x in url_map]
2910 if len(existing_formats) == 0:
2911 self._downloader.trouble(u'ERROR: no known formats available for video')
2913 if req_format is None:
2914 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2915 elif req_format == 'worst':
2916 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2917 elif req_format == '-1':
2918 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2921 if req_format not in url_map:
2922 self._downloader.trouble(u'ERROR: requested format not available')
2924 video_url_list = [(req_format, url_map[req_format])] # Specific format
2926 for format_param, video_real_url in video_url_list:
2928 # At this point we have a new video
2929 self._downloader.increment_downloads()
2932 video_extension = self._video_extensions.get(format_param, 'mp4')
2935 # Process video information
2936 self._downloader.process_info({
2937 'id': video_id.decode('utf-8'),
2938 'url': video_real_url.decode('utf-8'),
2939 'uploader': video_uploader.decode('utf-8'),
2940 'upload_date': upload_date,
2941 'title': video_title,
2942 'stitle': simple_title,
2943 'ext': video_extension.decode('utf-8'),
# Old-style 'and/or' conditional expression (pre-'a if c else b' idiom).
2944 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2945 'thumbnail': video_thumbnail.decode('utf-8'),
2946 'description': video_description.decode('utf-8'),
2949 except UnavailableVideoError, err:
2950 self._downloader.trouble(u'\nERROR: unable to download video')
# blip.tv IE: requests the JSON "skin" of the page; if the server answers
# with the media itself (Content-Type video/*), treats it as a direct
# download, otherwise parses the JSON post data.
# (Elided excerpt -- guard/return, "try:", dict-literal and assignment
# lines are not shown between the numbered lines.)
2952 class BlipTVIE(InfoExtractor):
2953 """Information extractor for blip.tv"""
2955 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2956 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2957 IE_NAME = u'blip.tv'
2959 def report_extraction(self, file_id):
2960 """Report information extraction."""
2961 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2963 def report_direct_download(self, title):
2964 """Report information extraction."""
2965 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2967 def _real_extract(self, url):
2968 mobj = re.match(self._VALID_URL, url)
2970 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar is '?' or '&' depending on whether the URL already has a query
# string (its assignment is elided in this excerpt).
2977 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2978 request = urllib2.Request(json_url)
2979 self.report_extraction(mobj.group(1))
2982 urlh = urllib2.urlopen(request)
2983 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2984 basename = url.split('/')[-1]
2985 title,ext = os.path.splitext(basename)
2986 title = title.decode('UTF-8')
2987 ext = ext.replace('.', '')
2988 self.report_direct_download(title)
2993 'stitle': _simplify_title(title),
2997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2998 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3000 if info is None: # Regular URL
3002 json_code = urlh.read()
3003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3004 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3008 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the post itself.
3009 if 'Post' in json_data:
3010 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with an AM/PM marker; %I is the
# usual pairing -- confirm against real blip.tv datestamp values.
3014 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3015 video_url = data['media']['url']
3016 umobj = re.match(self._URL_EXT, video_url)
3018 raise ValueError('Can not determine filename extension')
3019 ext = umobj.group(1)
3022 'id': data['item_id'],
3024 'uploader': data['display_name'],
3025 'upload_date': upload_date,
3026 'title': data['title'],
3027 'stitle': _simplify_title(data['title']),
3029 'format': data['media']['mimeType'],
3030 'thumbnail': data['thumbnailUrl'],
3031 'description': data['description'],
3032 'player_url': data['embedUrl']
# ValueError/KeyError cover both strptime failures and missing JSON keys.
3034 except (ValueError,KeyError), err:
3035 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3038 self._downloader.increment_downloads()
3041 self._downloader.process_info(info)
3042 except UnavailableVideoError, err:
3043 self._downloader.trouble(u'\nERROR: unable to download video')
# myvideo.de IE: derives the FLV URL from the thumbnail's image_src link
# and scrapes the page <title>. (Elided excerpt -- guard/return, "try:"
# and several dict-entry lines are not shown between the numbered lines.)
3046 class MyVideoIE(InfoExtractor):
3047 """Information Extractor for myvideo.de."""
3049 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3050 IE_NAME = u'myvideo'
3052 def __init__(self, downloader=None):
3053 InfoExtractor.__init__(self, downloader)
3055 def report_download_webpage(self, video_id):
3056 """Report webpage download."""
3057 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3059 def report_extraction(self, video_id):
3060 """Report information extraction."""
3061 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3063 def _real_extract(self,url):
3064 mobj = re.match(self._VALID_URL, url)
# NOTE(review): bug -- 'self._download' should be 'self._downloader';
# as written this line raises AttributeError whenever the URL fails to
# match (every other call in this class uses self._downloader).
3066 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3069 video_id = mobj.group(1)
3072 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3074 self.report_download_webpage(video_id)
3075 webpage = urllib2.urlopen(request).read()
3076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3077 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3080 self.report_extraction(video_id)
# The movie base URL is recovered from the thumbnail link; appending
# "/<id>.flv" below yields the downloadable media URL.
3081 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3084 self._downloader.trouble(u'ERROR: unable to extract media URL')
3086 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3088 mobj = re.search('<title>([^<]+)</title>', webpage)
3090 self._downloader.trouble(u'ERROR: unable to extract title')
3093 video_title = mobj.group(1)
3094 video_title = sanitize_title(video_title)
3096 simple_title = _simplify_title(video_title)
3099 self._downloader.process_info({
3103 'upload_date': u'NA',
3104 'title': video_title,
3105 'stitle': simple_title,
3110 except UnavailableVideoError:
3111 self._downloader.trouble(u'\nERROR: Unable to download video')
# ComedyCentralIE: extractor for The Daily Show / Colbert Report full episodes.
# NOTE(review): numbered, elided listing — embedded line numbers jump, so
# `if mobj is None:` guards, `try:`/`return` lines and some statements are
# missing from this excerpt; code below is kept byte-identical.
3113 class ComedyCentralIE(InfoExtractor):
3114 """Information extractor for The Daily Show and Colbert Report """
3116 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3117 IE_NAME = u'comedycentral'
3119 def report_extraction(self, episode_id):
3120 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3122 def report_config_download(self, episode_id):
3123 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3125 def report_index_download(self, episode_id):
3126 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3128 def report_player_url(self, episode_id):
3129 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3131 def _real_extract(self, url):
3132 mobj = re.match(self._VALID_URL, url)
3134 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Short-form inputs like ":tds" / ":cr" are rewritten to the show's
# full-episodes landing page, then re-matched against _VALID_URL.
3137 if mobj.group('shortname'):
3138 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3139 url = u'http://www.thedailyshow.com/full-episodes/'
3141 url = u'http://www.colbertnation.com/full-episodes/'
3142 mobj = re.match(self._VALID_URL, url)
3143 assert mobj is not None
3145 dlNewest = not mobj.group('episode')
3147 epTitle = mobj.group('showname')
3149 epTitle = mobj.group('episode')
3151 req = urllib2.Request(url)
3152 self.report_extraction(epTitle)
3154 htmlHandle = urllib2.urlopen(req)
3155 html = htmlHandle.read()
3156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3157 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to the newest episode; re-resolve the final
# URL so the episode name can be extracted from it.
3160 url = htmlHandle.geturl()
3161 mobj = re.match(self._VALID_URL, url)
3163 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3165 if mobj.group('episode') == '':
3166 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3168 epTitle = mobj.group('episode')
# Locate the Flash player URL embedded in the page markup / inline JS.
3170 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3171 if len(mMovieParams) == 0:
3172 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3175 playerUrl_raw = mMovieParams[0][0]
3176 self.report_player_url(epTitle)
3178 urlHandle = urllib2.urlopen(playerUrl_raw)
3179 playerUrl = urlHandle.geturl()
3180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3181 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index for this episode URI; each <item> is one segment.
3184 uri = mMovieParams[0][1]
3185 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3186 self.report_index_download(epTitle)
3188 indexXml = urllib2.urlopen(indexUrl).read()
3189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3190 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3193 idoc = xml.etree.ElementTree.fromstring(indexXml)
3194 itemEls = idoc.findall('.//item')
3195 for itemEl in itemEls:
3196 mediaId = itemEl.findall('./guid')[0].text
3197 shortMediaId = mediaId.split(':')[-1]
3198 showId = mediaId.split(':')[-2].replace('.com', '')
3199 officialTitle = itemEl.findall('./title')[0].text
3200 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment mediaGen config XML lists one <rendition> per bitrate.
3202 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3203 urllib.urlencode({'uri': mediaId}))
3204 configReq = urllib2.Request(configUrl)
3205 self.report_config_download(epTitle)
3207 configXml = urllib2.urlopen(configReq).read()
3208 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3209 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3212 cdoc = xml.etree.ElementTree.fromstring(configXml)
3214 for rendition in cdoc.findall('.//rendition'):
3215 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3219 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3222 # For now, just pick the highest bitrate
3223 format,video_url = turls[-1]
3225 self._downloader.increment_downloads()
3227 effTitle = showId + u'-' + epTitle
3232 'upload_date': officialDate,
3234 'stitle': _simplify_title(effTitle),
3238 'description': officialTitle,
3239 'player_url': playerUrl
3243 self._downloader.process_info(info)
3244 except UnavailableVideoError, err:
3245 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# EscapistIE: extractor for escapistmagazine.com video pages.
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3249 class EscapistIE(InfoExtractor):
3250 """Information extractor for The Escapist """
3252 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3253 IE_NAME = u'escapist'
3255 def report_extraction(self, showName):
3256 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3258 def report_config_download(self, showName):
3259 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3261 def _real_extract(self, url):
3262 htmlParser = HTMLParser.HTMLParser()
3264 mobj = re.match(self._VALID_URL, url)
3266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3268 showName = mobj.group('showname')
3269 videoId = mobj.group('episode')
3271 self.report_extraction(showName)
3273 webPage = urllib2.urlopen(url).read()
3274 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3275 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Description, thumbnail and player URL come from <meta> tags; the player
# config URL is URL-encoded inside the og:video value's `config=` param.
3278 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3279 description = htmlParser.unescape(descMatch.group(1))
3280 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3281 imgUrl = htmlParser.unescape(imgMatch.group(1))
3282 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3283 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3284 configUrlMatch = re.search('config=(.*)$', playerUrl)
3285 configUrl = urllib2.unquote(configUrlMatch.group(1))
3287 self.report_config_download(showName)
3289 configJSON = urllib2.urlopen(configUrl).read()
3290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3294 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap corrupts any value containing an
# apostrophe — fragile, but kept as the original behavior.
3295 configJSON = configJSON.replace("'", '"')
3298 config = json.loads(configJSON)
3299 except (ValueError,), err:
3300 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is assumed to be the actual video entry — TODO confirm.
3303 playlist = config['playlist']
3304 videoUrl = playlist[1]['url']
3306 self._downloader.increment_downloads()
3310 'uploader': showName,
3311 'upload_date': None,
3313 'stitle': _simplify_title(showName),
3316 'thumbnail': imgUrl,
3317 'description': description,
3318 'player_url': playerUrl,
3322 self._downloader.process_info(info)
3323 except UnavailableVideoError, err:
3324 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# CollegeHumorIE: extractor for collegehumor.com video pages.
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3327 class CollegeHumorIE(InfoExtractor):
3328 """Information extractor for collegehumor.com"""
3330 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3331 IE_NAME = u'collegehumor'
3333 def report_webpage(self, video_id):
3334 """Report information extraction."""
3335 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3337 def report_extraction(self, video_id):
3338 """Report information extraction."""
3339 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3341 def _real_extract(self, url):
3342 htmlParser = HTMLParser.HTMLParser()
3344 mobj = re.match(self._VALID_URL, url)
3346 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3348 video_id = mobj.group('videoid')
3350 self.report_webpage(video_id)
3351 request = urllib2.Request(url)
3353 webpage = urllib2.urlopen(request).read()
3354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal numeric ID used by the moogaloop XML API.
3358 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3360 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3362 internal_video_id = m.group('internalvideoid')
3366 'internal_id': internal_video_id,
# Metadata (title, description, file URL, thumbnail) comes from the
# moogaloop metadata XML for the internal ID.
3369 self.report_extraction(video_id)
3370 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3372 metaXml = urllib2.urlopen(xmlUrl).read()
3373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3377 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3379 videoNode = mdoc.findall('./video')[0]
3380 info['description'] = videoNode.findall('./description')[0].text
3381 info['title'] = videoNode.findall('./caption')[0].text
3382 info['stitle'] = _simplify_title(info['title'])
3383 info['url'] = videoNode.findall('./file')[0].text
3384 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# File extension is taken from the media URL's last '.'-separated part.
3385 info['ext'] = info['url'].rpartition('.')[2]
3386 info['format'] = info['ext']
3388 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3391 self._downloader.increment_downloads()
3394 self._downloader.process_info(info)
3395 except UnavailableVideoError, err:
3396 self._downloader.trouble(u'\nERROR: unable to download video')
# XVideosIE: extractor for xvideos.com video pages.
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3399 class XVideosIE(InfoExtractor):
3400 """Information extractor for xvideos.com"""
3402 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3403 IE_NAME = u'xvideos'
3405 def report_webpage(self, video_id):
3406 """Report information extraction."""
3407 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3409 def report_extraction(self, video_id):
3410 """Report information extraction."""
3411 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3413 def _real_extract(self, url):
3414 htmlParser = HTMLParser.HTMLParser()
3416 mobj = re.match(self._VALID_URL, url)
3418 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3420 video_id = mobj.group(1).decode('utf-8')
3422 self.report_webpage(video_id)
3424 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3426 webpage = urllib2.urlopen(request).read()
3427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3428 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3431 self.report_extraction(video_id)
# The direct FLV URL is percent-encoded in an inline `flv_url=` parameter.
3435 mobj = re.search(r'flv_url=(.+?)&', webpage)
3437 self._downloader.trouble(u'ERROR: unable to extract video url')
3439 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text with the trailing " - XVID..." suffix dropped.
3443 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3445 self._downloader.trouble(u'ERROR: unable to extract video title')
3447 video_title = mobj.group(1).decode('utf-8')
3450 # Extract video thumbnail
3451 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3453 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3455 video_thumbnail = mobj.group(1).decode('utf-8')
3459 self._downloader.increment_downloads()
3464 'upload_date': None,
3465 'title': video_title,
3466 'stitle': _simplify_title(video_title),
3469 'thumbnail': video_thumbnail,
3470 'description': None,
3475 self._downloader.process_info(info)
3476 except UnavailableVideoError, err:
3477 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# SoundcloudIE: extractor for soundcloud.com tracks (uid + stream token).
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3480 class SoundcloudIE(InfoExtractor):
3481 """Information extractor for soundcloud.com
3482 To access the media, the uid of the song and a stream token
3483 must be extracted from the page source and the script must make
3484 a request to media.soundcloud.com/crossdomain.xml. Then
3485 the media can be grabbed by requesting from an url composed
3486 of the stream token and uid
3489 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3490 IE_NAME = u'soundcloud'
3492 def __init__(self, downloader=None):
3493 InfoExtractor.__init__(self, downloader)
3495 def report_webpage(self, video_id):
3496 """Report information extraction."""
3497 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3499 def report_extraction(self, video_id):
3500 """Report information extraction."""
3501 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3503 def _real_extract(self, url):
3504 htmlParser = HTMLParser.HTMLParser()
3506 mobj = re.match(self._VALID_URL, url)
3508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3511 # extract uploader (which is in the url)
3512 uploader = mobj.group(1).decode('utf-8')
3513 # extract simple title (uploader + slug of song title)
3514 slug_title = mobj.group(2).decode('utf-8')
3515 simple_title = uploader + '-' + slug_title
3517 self.report_webpage('%s/%s' % (uploader, slug_title))
3519 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3521 webpage = urllib2.urlopen(request).read()
3522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3523 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3526 self.report_extraction('%s/%s' % (uploader, slug_title))
3528 # extract uid and stream token that soundcloud hands out for access
3529 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3531 video_id = mobj.group(1)
3532 stream_token = mobj.group(2)
3534 # extract unsimplified title
3535 mobj = re.search('"title":"(.*?)",', webpage)
3537 title = mobj.group(1)
3539 # construct media url (with uid/token)
3540 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3541 mediaURL = mediaURL % (video_id, stream_token)
3544 description = u'No description available'
3545 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3547 description = mobj.group(1)
# Upload date is scraped from the "pretty date" text; parse failures are
# apparently tolerated (the except body is elided from this excerpt).
3551 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3554 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3555 except Exception, e:
3558 # for soundcloud, a request to a cross domain is required for cookies
3559 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3562 self._downloader.process_info({
3563 'id': video_id.decode('utf-8'),
3565 'uploader': uploader.decode('utf-8'),
3566 'upload_date': upload_date,
# NOTE(review): 'title' is set to the simplified slug title, not the
# scraped `title` variable — looks unintentional; confirm before changing.
3567 'title': simple_title.decode('utf-8'),
3568 'stitle': simple_title.decode('utf-8'),
3572 'description': description.decode('utf-8')
3574 except UnavailableVideoError:
3575 self._downloader.trouble(u'\nERROR: unable to download video')
# InfoQIE: extractor for infoq.com presentation videos (RTMP streams).
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3578 class InfoQIE(InfoExtractor):
3579 """Information extractor for infoq.com"""
3581 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3584 def report_webpage(self, video_id):
3585 """Report information extraction."""
3586 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3588 def report_extraction(self, video_id):
3589 """Report information extraction."""
3590 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3592 def _real_extract(self, url):
3593 htmlParser = HTMLParser.HTMLParser()
3595 mobj = re.match(self._VALID_URL, url)
3597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3600 self.report_webpage(url)
3602 request = urllib2.Request(url)
3604 webpage = urllib2.urlopen(request).read()
3605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3606 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3609 self.report_extraction(url)
# The media path is base64-encoded in a `jsclassref` attribute and joined
# onto an rtmpe:// base URL.
3613 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3615 self._downloader.trouble(u'ERROR: unable to extract video url')
3617 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3621 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3623 self._downloader.trouble(u'ERROR: unable to extract video title')
3625 video_title = mobj.group(1).decode('utf-8')
3627 # Extract description
3628 video_description = u'No description available.'
3629 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3630 if mobj is not None:
3631 video_description = mobj.group(1).decode('utf-8')
# Video id and extension are derived from the media URL's final path part;
# assumes exactly one '.' in the filename (split would fail otherwise).
3633 video_filename = video_url.split('/')[-1]
3634 video_id, extension = video_filename.split('.')
3636 self._downloader.increment_downloads()
3641 'upload_date': None,
3642 'title': video_title,
3643 'stitle': _simplify_title(video_title),
3645 'format': extension, # Extension is always(?) mp4, but seems to be flv
3647 'description': video_description,
3652 self._downloader.process_info(info)
3653 except UnavailableVideoError, err:
3654 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# MixcloudIE: extractor for mixcloud.com cloudcasts via the JSON API.
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3656 class MixcloudIE(InfoExtractor):
3657 """Information extractor for www.mixcloud.com"""
3658 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3659 IE_NAME = u'mixcloud'
3661 def __init__(self, downloader=None):
3662 InfoExtractor.__init__(self, downloader)
3664 def report_download_json(self, file_id):
3665 """Report JSON download."""
3666 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3668 def report_extraction(self, file_id):
3669 """Report information extraction."""
3670 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns candidate URLs for `fmt`; picks the best bitrate when the format
# entry is bitrate-keyed, otherwise treats the entry as a flat URL list.
3672 def get_urls(self, jsonData, fmt, bitrate='best'):
3673 """Get urls from 'audio_formats' section in json"""
3676 bitrate_list = jsonData[fmt]
3677 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3678 bitrate = max(bitrate_list) # select highest
3680 url_list = jsonData[fmt][bitrate]
3681 except TypeError: # we have no bitrate info.
3682 url_list = jsonData[fmt]
# Probes each URL with a live request; expensive but the only way to know
# which mirror is up.
3686 def check_urls(self, url_list):
3687 """Returns 1st active url from list"""
3688 for url in url_list:
3690 urllib2.urlopen(url)
3692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3697 def _print_formats(self, formats):
3698 print 'Available formats:'
3699 for fmt in formats.keys():
3700 for b in formats[fmt]:
3702 ext = formats[fmt][b][0]
3703 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3704 except TypeError: # we have no bitrate info
3705 ext = formats[fmt][0]
3706 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3709 def _real_extract(self, url):
3710 mobj = re.match(self._VALID_URL, url)
3712 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3714 # extract uploader & filename from url
3715 uploader = mobj.group(1).decode('utf-8')
3716 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3718 # construct API request
3719 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3720 # retrieve .json file with links to files
3721 request = urllib2.Request(file_url)
3723 self.report_download_json(file_url)
3724 jsonData = urllib2.urlopen(request).read()
3725 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3726 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3730 json_data = json.loads(jsonData)
3731 player_url = json_data['player_swf_url']
3732 formats = dict(json_data['audio_formats'])
3734 req_format = self._downloader.params.get('format', None)
3737 if self._downloader.params.get('listformats', None):
3738 self._print_formats(formats)
# Format selection: with no explicit request, take the first format whose
# URL probes alive; otherwise honor the requested format if available.
3741 if req_format is None or req_format == 'best':
3742 for format_param in formats.keys():
3743 url_list = self.get_urls(formats, format_param)
3745 file_url = self.check_urls(url_list)
3746 if file_url is not None:
3749 if req_format not in formats.keys():
3750 self._downloader.trouble(u'ERROR: format is not available')
3753 url_list = self.get_urls(formats, req_format)
3754 file_url = self.check_urls(url_list)
3755 format_param = req_format
3758 self._downloader.increment_downloads()
3760 # Process file information
3761 self._downloader.process_info({
3762 'id': file_id.decode('utf-8'),
3763 'url': file_url.decode('utf-8'),
3764 'uploader': uploader.decode('utf-8'),
3765 'upload_date': u'NA',
3766 'title': json_data['name'],
3767 'stitle': _simplify_title(json_data['name']),
3768 'ext': file_url.split('.')[-1].decode('utf-8'),
3769 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3770 'thumbnail': json_data['thumbnail_url'],
3771 'description': json_data['description'],
3772 'player_url': player_url.decode('utf-8'),
3774 except UnavailableVideoError, err:
3775 self._downloader.trouble(u'ERROR: unable to download file')
# StanfordOpenClassroomIE: extractor for openclassroom.stanford.edu.
# Handles three URL shapes: a specific video, a course page (yields
# references to its videos), and the site root (yields course references).
# NOTE(review): numbered, elided listing — guards/`try:`/`return` lines are
# missing from this excerpt; code kept byte-identical.
3777 class StanfordOpenClassroomIE(InfoExtractor):
3778 """Information extractor for Stanford's Open ClassRoom"""
3780 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3781 IE_NAME = u'stanfordoc'
3783 def report_download_webpage(self, objid):
3784 """Report information extraction."""
3785 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3787 def report_extraction(self, video_id):
3788 """Report information extraction."""
3789 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3791 def _real_extract(self, url):
3792 mobj = re.match(self._VALID_URL, url)
3794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3797 if mobj.group('course') and mobj.group('video'): # A specific video
3798 course = mobj.group('course')
3799 video = mobj.group('video')
3801 'id': _simplify_title(course + '_' + video),
3804 self.report_extraction(info['id'])
3805 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3806 xmlUrl = baseUrl + video + '.xml'
3808 metaXml = urllib2.urlopen(xmlUrl).read()
3809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3810 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3812 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3814 info['title'] = mdoc.findall('./title')[0].text
3815 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3817 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3819 info['stitle'] = _simplify_title(info['title'])
3820 info['ext'] = info['url'].rpartition('.')[2]
3821 info['format'] = info['ext']
3822 self._downloader.increment_downloads()
3824 self._downloader.process_info(info)
3825 except UnavailableVideoError, err:
3826 self._downloader.trouble(u'\nERROR: unable to download video')
3827 elif mobj.group('course'): # A course page
3828 unescapeHTML = HTMLParser.HTMLParser().unescape
3830 course = mobj.group('course')
3832 'id': _simplify_title(course),
3836 self.report_download_webpage(info['id'])
3838 coursepage = urllib2.urlopen(url).read()
3839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3840 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3843 m = re.search('<h1>([^<]+)</h1>', coursepage)
3845 info['title'] = unescapeHTML(m.group(1))
3847 info['title'] = info['id']
3848 info['stitle'] = _simplify_title(info['title'])
3850 m = re.search('<description>([^<]+)</description>', coursepage)
3852 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry, recursively extracted.
3854 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3857 'type': 'reference',
3858 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3862 for entry in info['list']:
3863 assert entry['type'] == 'reference'
3864 self.extract(entry['url'])
# Root page: enumerate all CoursePage links and recurse into each course.
3866 unescapeHTML = HTMLParser.HTMLParser().unescape
3869 'id': 'Stanford OpenClassroom',
3873 self.report_download_webpage(info['id'])
3874 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3876 rootpage = urllib2.urlopen(rootURL).read()
3877 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3878 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3881 info['title'] = info['id']
3882 info['stitle'] = _simplify_title(info['title'])
3884 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3887 'type': 'reference',
3888 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3892 for entry in info['list']:
3893 assert entry['type'] == 'reference'
3894 self.extract(entry['url'])
3896 class MTVIE(InfoExtractor):
3897 """Information extractor for MTV.com"""
3899 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3902 def report_webpage(self, video_id):
3903 """Report information extraction."""
3904 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3906 def report_extraction(self, video_id):
3907 """Report information extraction."""
3908 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3910 def _real_extract(self, url):
3911 mobj = re.match(self._VALID_URL, url)
3913 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3915 if not mobj.group('proto'):
3916 url = 'http://' + url
3917 video_id = mobj.group('videoid')
3918 self.report_webpage(video_id)
3920 request = urllib2.Request(url)
3922 webpage = urllib2.urlopen(request).read()
3923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3924 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3927 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3929 self._downloader.trouble(u'ERROR: unable to extract song name')
3931 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3932 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3934 self._downloader.trouble(u'ERROR: unable to extract performer')
3936 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3937 video_title = performer + ' - ' + song_name
3939 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3941 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3943 mtvn_uri = mobj.group(1)
3945 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3947 self._downloader.trouble(u'ERROR: unable to extract content id')
3949 content_id = mobj.group(1)
3951 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3952 self.report_extraction(video_id)
3953 request = urllib2.Request(videogen_url)
3955 metadataXml = urllib2.urlopen(request).read()
3956 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3957 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3960 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3961 renditions = mdoc.findall('.//rendition')
3963 # For now, always pick the highest quality.
3964 rendition = renditions[-1]
3967 _,_,ext = rendition.attrib['type'].partition('/')
3968 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3969 video_url = rendition.find('./src').text
3971 self._downloader.trouble('Invalid rendition field.')
3974 self._downloader.increment_downloads()
3978 'uploader': performer,
3979 'title': video_title,
3980 'stitle': _simplify_title(video_title),
3986 self._downloader.process_info(info)
3987 except UnavailableVideoError, err:
3988 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-processing steps.

    A PostProcessor is attached to a downloader via its
    add_post_processor() method ("mutual registration", like
    InfoExtractor objects). After every successful download the
    downloader walks its chain of processors, calling run() on each
    one: the first call receives the initial information dictionary,
    and each subsequent call receives whatever the previous processor
    returned. The chain stops when a processor returns None or the end
    is reached.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        ``information`` is an InfoExtractor-style dictionary with one
        extra key, 'filepath', naming the downloaded file. Returning
        None stops the postprocessing chain; returning a (possibly
        modified) dictionary passes it on to the next processor. A
        PostProcessingError may be raised to report failure to the
        downloader.
        """
        # Default behavior: pass the information through unchanged.
        return information
class AudioConversionError(BaseException):
    """Raised when ffmpeg audio extraction/conversion fails.

    FIX: the base initializer was never called, so str(err) and
    err.args were empty even when a message was supplied. The legacy
    ``.message`` attribute is kept for existing callers.

    NOTE(review): deriving from BaseException (not Exception) is
    unusual — kept as-is because callers isinstance-check this type
    explicitly, and switching the base would change what generic
    ``except Exception`` handlers catch.
    """

    def __init__(self, message):
        # Populate args/str() in addition to the legacy attribute.
        BaseException.__init__(self, message)
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts/converts the audio track of a downloaded
    video by shelling out to the external ffmpeg/ffprobe binaries.

    NOTE(review): several source lines are elided in this excerpt (decorators,
    'try:'/'else:'/'return' statements); inline notes below flag the gaps.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        # preferredcodec: requested audio format ('best', 'aac', 'mp3',
        #   'vorbis', 'm4a' or 'wav'); None is normalised to 'best'.
        # preferredquality: bitrate string handed to ffmpeg via '-ab'.
        # keepvideo: when True, the source video file is kept after extraction.
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    # Takes no 'self' -- presumably decorated with @staticmethod on an elided
    # line above; confirm against upstream.
    def get_audio_codec(path):
        # Return the codec name of the audio stream in *path* (or None on
        # failure) by scanning `ffprobe -show_streams` output.
        # '--' ends option parsing so filenames starting with '-' are safe.
        cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
        # file() is Python 2; ffprobe's stderr is discarded into os.devnull.
        handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
        output = handle.communicate()[0]
        if handle.wait() != 0:
        # NOTE(review): body of the 'if' above, the enclosing 'try:' and an
        # 'audio_codec = None' initialisation are elided in this excerpt.
        except (IOError, OSError):
        # NOTE(review): handler body elided (presumably 'return None').
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                # Remember the most recent codec name; it belongs to the
                # stream whose 'codec_type' line follows it.
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
            # NOTE(review): branch body elided (presumably 'return audio_codec').

    # Also takes no 'self' -- presumably @staticmethod; decorator elided.
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Transcode *path* to *out_path*; raises AudioConversionError when
        # ffmpeg is missing or exits with a non-zero status.
        # NOTE(review): an 'if codec is None:' alternative appears to be
        # elided before this line -- confirm against upstream.
        acodec_opts = ['-acodec', codec]
        # '-y' overwrites the output; '-vn' drops the video stream.
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout,stderr = p.communicate()
        except (IOError, OSError):
            # sys.exc_info() instead of 'except ... as e' keeps compatibility
            # with older Python 2 releases.
            e = sys.exc_info()[1]
            # errno 2 (ENOENT): the ffmpeg binary itself was not found.
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
        if p.returncode != 0:
            # Report only the last stderr line; it usually carries the error.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        # PostProcessor hook: extract audio from information['filepath'] and
        # repoint 'filepath' at the produced audio file.
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
        # NOTE(review): an early 'return' and a 'more_opts = []'
        # initialisation appear to be elided here.

        # First branch: the source already matches the request (or 'best'),
        # so a lossless copy is preferred over re-encoding.
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                extension = self._preferredcodec
                # aac_adtstoasc bitstream filter: ADTS AAC -> MP4/M4A container.
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                # NOTE(review): branch body elided; the lines below belong to
                # an elided 'else:' arm (MP3 fallback).
            acodec = 'libmp3lame'
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
        # NOTE(review): the outer 'else:' line is elided; the indented lines
        # below perform the lossy conversion to the requested codec.
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
            # NOTE(review): branch body elided (presumably extension = 'ogg').
            if self._preferredcodec == 'wav':
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        # NOTE(review): the enclosing 'try:' before the call below and the
        # matching 'except:'/'else:'/'return' lines around the error
        # reporting are elided in this excerpt.
            self.run_ffmpeg(path, new_path, acodec, more_opts)
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            self._downloader.to_stderr(u'ERROR: error running ffmpeg')

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            # NOTE(review): 'try:'/'except' wrappers elided around these lines.
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            # NOTE(review): 'try:' elided before the removal.
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        information['filepath'] = new_path
        # NOTE(review): trailing 'return information' elided in this excerpt.
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # Refuse early rather than fail after downloading.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

    # NOTE(review): the enclosing 'try:' lines (and the matching
    # 'finally: urlh.close()' / early 'return') are elided in this excerpt.
            # Python 2 urllib; UPDATE_URL is the raw script on GitHub (HEAD).
            urlh = urllib.urlopen(UPDATE_URL)
            newcontent = urlh.read()
            # Skip the rewrite when the remote script already reports the
            # version we are running.
            vmatch = re.search("__version__ = '([^']+)'", newcontent)
            if vmatch is not None and vmatch.group(1) == __version__:
                downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
    except (IOError, OSError), err:  # Python 2 'except ..., err' syntax
        sys.exit('ERROR: unable to download latest version')

        # NOTE(review): enclosing 'try:' elided; binary mode avoids newline
        # translation when overwriting the running script.
        outf = open(filename, 'wb')
            outf.write(newcontent)
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
    def _readOptions(filename_bytes):
        # Read extra command-line options from a configuration file; a
        # missing file yields an empty option list.
        # NOTE(review): helper appears to be nested inside parseOpts() (its
        # 'def' line is elided), and the 'try:' before the open is elided.
            optionf = open(filename_bytes)
        # NOTE(review): 'except IOError:' line elided before the return below.
            return [] # silently skip if file is not present
        # NOTE(review): lines elided ('try:', 'res = []', 'for l in optionf:',
        # plus the trailing 'finally: optionf.close()' / 'return res').
                # shlex honours shell-style quoting and strips '#' comments.
                res += shlex.split(l, comments=True)
    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --format METAVAR'''
        # Builds the left-hand option column for --help output; installed as
        # the help formatter's format_option_strings hook.
        # NOTE(review): the 'opts = []' initialisation appears to be elided
        # in this excerpt.

        if option._short_opts: opts.append(option._short_opts[0])
        if option._long_opts: opts.append(option._long_opts[0])
        # Separate "-o" and "--option" with a comma when both exist.
        if len(opts) > 1: opts.insert(1, ', ')

        # Show the metavar only for options that take a value.
        if option.takes_value(): opts.append(' %s' % option.metavar)

        return "".join(opts)
    def _find_term_columns():
        # Best-effort terminal width detection for --help wrapping.
        # Prefer the COLUMNS environment variable when set.
        columns = os.environ.get('COLUMNS', None)
        # NOTE(review): lines elided here (presumably returning int(columns)
        # when set, followed by a 'try:'; the trailing 'except: pass' /
        # 'return None' are also elided).
            # Fall back to asking the tty: `stty size` prints "rows cols".
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
    # NOTE(review): interior of parseOpts(); the 'def parseOpts():' line and
    # the 'max_width = 80' default are elided in this excerpt.
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    # Use the compact "-o, --option METAVAR" rendering for the help output.
    fmt.format_option_strings = _format_option_string

    # OptionParser keyword arguments; NOTE(review): the 'kw = {' opener, the
    # 'formatter' entry and the closing '}' are elided in this excerpt.
        'version' : __version__,
        'usage' : '%prog [options] url [url...]',
        # 'resolve' lets a later option silently override an earlier one
        # registered under the same switch string.
        'conflict_handler' : 'resolve',

    parser = optparse.OptionParser(**kw)

    # One OptionGroup per --help section.
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Configuration files are prepended to the real argv: system-wide first,
    # then the per-user (XDG) file, so the command line wins on conflicts.
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    # NOTE(review): 'if xdg_config_home:' / 'else:' lines elided around the
    # two alternative userConf assignments below.
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
def gen_extractors():
    """Return a list of an instance of every supported extractor.

    The order does matter; the first extractor matched is the one handling
    the URL.
    """
    # These IEs are shared so that the playlist/user/search extractors can
    # delegate individual videos to the same single-video instance.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # NOTE(review): the 'return [' line is elided in this excerpt, and many
    # list entries between the visible ones are elided as well.
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        MetacafeIE(youtube_ie),
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
        StanfordOpenClassroomIE(),
    # NOTE(review): interior of the main driver function (its 'def' line is
    # elided in this excerpt); several 'try:'/'else:' lines are elided below.
    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    # NOTE(review): 'else:' and 'try:' lines elided here.
            jar = cookielib.MozillaCookieJar(opts.cookiefile)
            # Only load when the file already exists and is readable; the jar
            # is dumped back at the end of the run.
            if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
        except (IOError, OSError), err:
            sys.exit(u'ERROR: unable to open cookie file')

    if opts.dump_user_agent:
        print std_headers['User-Agent']  # Python 2 print statement

    # Batch file verification
    # NOTE(review): a 'batchurls = []' default appears to be elided here.
    if opts.batchfile is not None:
            # NOTE(review): enclosing 'try:' elided; '-' means read URLs from
            # stdin (that branch's body and the 'else:' are elided too).
            if opts.batchfile == '-':
                batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            # Drop blanks and lines starting with '#', '/' or ';' (comments).
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    if opts.list_extractors:
        for ie in extractors:
            # Show which of the given URLs each extractor would claim; each
            # URL is reported under the first extractor that matches it.
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
            # NOTE(review): loop body elided (presumably printing mu).

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Prompt rather than require the password on the command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        # NOTE(review): 'try:' elided before the conversion.
            opts.retries = long(opts.retries)  # Python 2 long
        except (TypeError, ValueError), err:
            parser.error(u'invalid retry count specified')
        # NOTE(review): 'try:' elided before the playlist-start validation.
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
        # NOTE(review): 'try:' elided before the playlist-end validation;
        # -1 is the sentinel for "until the last entry".
        opts.playlistend = int(opts.playlistend)
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
            parser.error(u'invalid audio format specified')

    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # Any of the "print X and stop" flags implies quiet mode, so the
        # requested value is the only thing written to stdout.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # First truthy template wins; the chain encodes the precedence of
        # -o over the -t/-l/-A convenience flags.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        # Media going to stdout ('-o -') forces log output onto stderr.
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        'max_downloads': opts.max_downloads,
        'prefer_free_formats': opts.prefer_free_formats,
        # NOTE(review): the closing '})' of this dict is elided here.
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # Update version
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # Maybe do nothing
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        # NOTE(review): the 'else: sys.exit()' style early exit and the
        # 'try:' before the download call are elided here.

        retcode = fd.download(all_urls)
    except MaxDownloadsReached:
        fd.to_screen(u'--max-download limit reached, aborting.')

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        # NOTE(review): 'try: jar.save()' elided before this handler.
        except (IOError, OSError), err:
            sys.exit(u'ERROR: unable to save cookie jar')
    # Tail of the top-level entry function: translate the exceptions the
    # driver lets escape into process exit codes/messages.
    # NOTE(review): the enclosing 'def main():' and the 'try:' around the
    # driver call are elided in this excerpt.
    except DownloadError:
    # NOTE(review): handler body elided (presumably a non-zero sys.exit).
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        # Ctrl-C: exit with a message instead of a traceback.
        sys.exit(u'\nERROR: Interrupted by user')
# Script entry point.
# NOTE(review): the guard's body (presumably the call into main()) is elided
# in this excerpt.
if __name__ == '__main__':

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: