2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
# Package metadata and the URL used by the self-update mechanism.
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
# Default HTTP headers sent with every request (the `std_headers = {`
# opening line, original line 75, is elided from this listing).  A fixed
# Firefox User-Agent avoids server-side bot filtering.
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# --- trivialjson fallback -------------------------------------------------
# Pure-Python JSON decoder used when no `json` module exists (Python < 2.6).
# NOTE(review): this is a sampled listing -- many original lines (85-193)
# are elided, so each closure below appears truncated.
91 def raiseError(msg, i):
# Error helper: report the failing position and the unparsed remainder.
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
# Advance past JSON whitespace; raises when the input ends while more
# tokens are still expected.
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
# Decode one backslash escape inside a JSON string, including \uXXXX
# and UTF-16 surrogate pairs (the 5+6 length check below).
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
# Combine a UTF-16 surrogate pair into a single code point.
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
# parseString: find the closing quote (preceded by an even number of
# backslashes) and substitute the escapes via decodeEscape.
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
# parseObj: parse a JSON object into a dict.
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
# parseArray: parse a JSON array into a list.
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
# Parse the literals true / false / null.
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
# parseNumber: match the JSON number grammar; float when a fraction or
# exponent is present, int otherwise.
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
# Dispatch on the first character; anything else is treated as a number.
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
# Trailing non-whitespace after the top-level value is an error.
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
197 class IDParser(HTMLParser.HTMLParser):
198 """Modified HTMLParser that isolates a tag with the specified id"""
199 def __init__(self, id):
# Remember the id to search for; several initialisation lines
# (200-204) are elided from this listing.
205 self.watch_startpos = False
206 HTMLParser.HTMLParser.__init__(self)
208 def loads(self, html):
# Keep the raw document so get_result() can slice it using the
# line/column positions reported by getpos().
213 def handle_starttag(self, tag, attrs):
216 self.find_startpos(None)
# NOTE(review): HTMLParser passes `attrs` as a list of (name, value)
# pairs; the dict-style test below presumably relies on an elided
# `attrs = dict(attrs)` line -- confirm against the full source.
217 if 'id' in attrs and attrs['id'] == self.id:
220 self.watch_startpos = True
# Maintain a per-tag nesting depth so the matching close tag is found.
222 if not tag in self.depth: self.depth[tag] = 0
225 def handle_endtag(self, tag):
227 if tag in self.depth: self.depth[tag] -= 1
228 if self.depth[self.result[0]] == 0:
# Depth back to zero: record the end position of the wanted element.
230 self.result.append(self.getpos())
232 def find_startpos(self, x):
233 """Needed to put the start position of the result (self.result[1])
234 after the opening tag with the requested id"""
235 if self.watch_startpos:
236 self.watch_startpos = False
237 self.result.append(self.getpos())
# Any parser event fired right after the opening tag pins the start
# position, hence all callbacks alias find_startpos.
238 handle_entityref = handle_charref = handle_data = handle_comment = \
239 handle_decl = handle_pi = unknown_decl = find_startpos
241 def get_result(self):
# Slice the stored HTML between the recorded start/end positions
# (getpos() returns 1-based line numbers with 0-based columns).
242 if self.result == None: return None
243 if len(self.result) != 3: return None
244 lines = self.html.split('\n')
245 lines = lines[self.result[1][0]-1:self.result[2][0]]
246 lines[0] = lines[0][self.result[1][1]:]
# NOTE(review): the two assignments below overwrite each other as
# shown; presumably one belongs to an elided single-line special case
# (an `if` on original line 247) -- confirm against the full source.
248 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
249 lines[-1] = lines[-1][:self.result[2][1]]
250 return '\n'.join(lines).strip()
252 def get_element_by_id(id, html):
253 """Return the content of the tag with the specified id in the passed HTML document"""
254 parser = IDParser(id)
# The parser.loads(html) call (elided) drives the parse; malformed HTML
# is tolerated -- parse errors are swallowed and whatever was isolated
# so far is returned.
257 except HTMLParser.HTMLParseError:
259 return parser.get_result()
262 def preferredencoding():
263 """Get preferred encoding.
265 Returns the best encoding scheme for the system, based on
266 locale.getpreferredencoding() and some further tweaks.
# The generator-plus-.next() construction guards against
# locale.getpreferredencoding() raising on a misconfigured locale; the
# fallback encoding is on an elided line -- confirm against full source.
268 def yield_preferredencoding():
270 pref = locale.getpreferredencoding()
276 return yield_preferredencoding().next()
279 def htmlentity_transform(matchobj):
280 """Transforms an HTML entity to a Unicode character.
282 This function receives a match object and is intended to be used with
283 the re.sub() function.
285 entity = matchobj.group(1)
287 # Known non-numeric HTML entity
288 if entity in htmlentitydefs.name2codepoint:
289 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: "#160" (decimal) or "#x.." (hex).
# NOTE(review): the pattern only allows \d after the optional 'x', so a
# hex entity containing a-f (e.g. &#xe9;) will not match and falls
# through to the literal-representation return below -- likely a bug.
292 mobj = re.match(ur'(?u)#(x?\d+)', entity)
294 numstr = mobj.group(1)
295 if numstr.startswith(u'x'):
# Rewrite "x.." as "0x.." so long(numstr, base) can parse it; the base
# selection lines are elided from this listing.
297 numstr = u'0%s' % numstr
300 return unichr(long(numstr, base))
302 # Unknown entity in name, return its literal representation
303 return (u'&%s;' % entity)
306 def clean_html(html):
307 """Clean an HTML snippet into a readable string"""
# Newlines are normalised first so <br> tags become the only line breaks.
309 html = html.replace('\n', ' ')
310 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
# Strip all remaining tags (non-greedy, so text between tags survives).
312 html = re.sub('<.*?>', '', html)
313 # Replace html entities
314 html = _unescapeHTML(html)
# The final strip()/return is on an elided line.
def sanitize_title(utitle):
    """Make a video title usable as part of a filename."""
    unescaped = _unescapeHTML(utitle)
    # os.sep would split the name into directories; map it to '%'.
    return unescaped.replace(unicode(os.sep), u'%')
324 def sanitize_open(filename, open_mode):
325 """Try to open the given filename, and slightly tweak it if this fails.
327 Attempts to open the given filename. If this fails, it tries to change
328 the filename slightly, step by step, until it's either able to open it
329 or it fails and raises a final exception, like the standard open()
332 It returns the tuple (stream, definitive_file_name).
# '-' means stdout; on Windows stdout must be switched to binary mode
# first so video data is not mangled by CRLF translation.
336 if sys.platform == 'win32':
338 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
339 return (sys.stdout, filename)
340 stream = open(_encodeFilename(filename), open_mode)
341 return (stream, filename)
342 except (IOError, OSError), err:
343 # In case of error, try to remove win32 forbidden chars
344 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
346 # An exception here should be caught in the caller
347 stream = open(_encodeFilename(filename), open_mode)
348 return (stream, filename)
351 def timeconvert(timestr):
352 """Convert RFC 2822 defined time string into system timestamp"""
354 timetuple = email.utils.parsedate_tz(timestr)
355 if timetuple is not None:
356 timestamp = email.utils.mktime_tz(timetuple)
# The `timestamp` default and the final return are on elided lines;
# presumably unparsable strings yield None -- confirm against full source.
359 def _simplify_title(title):
360 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
361 return expr.sub(u'_', title).strip(u'_')
363 def _orderedSet(iterable):
364 """ Remove all duplicates from the input iterable """
# Body (original lines 365-370) is elided from this listing; per the
# name it presumably preserves first-seen order -- confirm against the
# full source.
371 def _unescapeHTML(s):
373 @param s a string (of type unicode)
# Only unicode input is accepted; the assert makes misuse fail fast.
375 assert type(s) == type(u'')
# Each &name;/&#nnn; entity is decoded by htmlentity_transform; the
# final return is on an elided line.
377 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
380 def _encodeFilename(s):
382 @param s The name of the file (of type unicode)
385 assert type(s) == type(u'')
387 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
388 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
389 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
390 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
# The `return s` for the Windows branch is on an elided line; elsewhere
# the name is encoded with the filesystem encoding, dropping
# unencodable characters.
393 return s.encode(sys.getfilesystemencoding(), 'ignore')
# --- Exception hierarchy ---------------------------------------------------
# Class bodies (pass lines, docstring closers) are elided from this listing.
395 class DownloadError(Exception):
396 """Download Error exception.
398 This exception may be thrown by FileDownloader objects if they are not
399 configured to continue on errors. They will contain the appropriate
405 class SameFileError(Exception):
406 """Same File exception.
408 This exception will be thrown by FileDownloader objects if they detect
409 multiple files would have to be downloaded to the same file on disk.
414 class PostProcessingError(Exception):
415 """Post Processing exception.
417 This exception may be raised by PostProcessor's .run() method to
418 indicate an error in the postprocessing task.
422 class MaxDownloadsReached(Exception):
423 """ --max-downloads limit has been reached. """
427 class UnavailableVideoError(Exception):
428 """Unavailable Format exception.
430 This exception will be thrown when a video is requested
431 in a format that is not available for that video.
436 class ContentTooShortError(Exception):
437 """Content Too Short exception.
439 This exception may be raised by FileDownloader objects when a file they
440 download is too small for what the server announced first, indicating
441 the connection was probably interrupted.
# Record both byte counts so callers can report expected vs. received.
447 def __init__(self, downloaded, expected):
448 self.downloaded = downloaded
449 self.expected = expected
452 class YoutubeDLHandler(urllib2.HTTPHandler):
453 """Handler for HTTP requests and responses.
455 This class, when installed with an OpenerDirector, automatically adds
456 the standard headers to every HTTP request and handles gzipped and
457 deflated responses from web servers. If compression is to be avoided in
458 a particular request, the original request in the program code only has
459 to include the HTTP header "Youtubedl-No-Compression", which will be
460 removed before making the real request.
462 Part of this code was copied from:
464 http://techknack.net/python-urllib2-handlers/
466 Andrew Rowls, the author of that code, agreed to release it to the
# deflate(): some servers send raw-deflate data without the zlib
# header; the two returns below are presumably a try/except pair
# (elided) that falls back from raw to zlib-wrapped decompression.
473 return zlib.decompress(data, -zlib.MAX_WBITS)
475 return zlib.decompress(data)
478 def addinfourl_wrapper(stream, headers, url, code):
# Older Pythons lack addinfourl.getcode(); emulate it when missing.
479 if hasattr(urllib2.addinfourl, 'getcode'):
480 return urllib2.addinfourl(stream, headers, url, code)
481 ret = urllib2.addinfourl(stream, headers, url)
485 def http_request(self, req):
# Add each standard header (the has_header guard is elided), then
# honour and strip the internal no-compression marker header.
486 for h in std_headers:
489 req.add_header(h, std_headers[h])
490 if 'Youtubedl-no-compression' in req.headers:
491 if 'Accept-encoding' in req.headers:
492 del req.headers['Accept-encoding']
493 del req.headers['Youtubedl-no-compression']
496 def http_response(self, req, resp):
# Transparently decompress gzip/deflate bodies, preserving the original
# response's headers, URL, code and msg.
499 if resp.headers.get('Content-encoding', '') == 'gzip':
500 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
501 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
502 resp.msg = old_resp.msg
504 if resp.headers.get('Content-encoding', '') == 'deflate':
505 gz = StringIO.StringIO(self.deflate(resp.read()))
506 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
507 resp.msg = old_resp.msg
511 class FileDownloader(object):
512 """File Downloader class.
514 File downloader objects are the ones responsible of downloading the
515 actual video file and writing it to disk if the user has requested
516 it, among some other tasks. In most cases there should be one per
517 program. As, given a video URL, the downloader doesn't know how to
518 extract all the needed information, task that InfoExtractors do, it
519 has to pass the URL to one of them.
521 For this, file downloader objects have a method that allows
522 InfoExtractors to be registered in a given order. When it is passed
523 a URL, the file downloader handles it to the first InfoExtractor it
524 finds that reports being able to handle it. The InfoExtractor extracts
525 all the information about the video or videos the URL refers to, and
526 asks the FileDownloader to process the video information, possibly
527 downloading the video.
529 File downloaders accept a lot of parameters. In order not to saturate
530 the object constructor with arguments, it receives a dictionary of
531 options instead. These options are available through the params
532 attribute for the InfoExtractors to use. The FileDownloader also
533 registers itself as the downloader in charge for the InfoExtractors
534 that are added to it, so this is a "mutual registration".
538 username: Username for authentication purposes.
539 password: Password for authentication purposes.
540 usenetrc: Use netrc for authentication instead.
541 quiet: Do not print messages to stdout.
542 forceurl: Force printing final URL.
543 forcetitle: Force printing title.
544 forcethumbnail: Force printing thumbnail URL.
545 forcedescription: Force printing description.
546 forcefilename: Force printing final filename.
547 simulate: Do not download the video files.
548 format: Video format code.
549 format_limit: Highest quality format to try.
550 outtmpl: Template for output names.
551 ignoreerrors: Do not stop on download errors.
552 ratelimit: Download speed limit, in bytes/sec.
553 nooverwrites: Prevent overwriting files.
554 retries: Number of times to retry for HTTP error 5xx
555 continuedl: Try to continue downloads if possible.
556 noprogress: Do not print the progress bar.
557 playliststart: Playlist item to start at.
558 playlistend: Playlist item to end at.
559 matchtitle: Download only matching titles.
560 rejecttitle: Reject downloads for matching titles.
561 logtostderr: Log messages to stderr instead of stdout.
562 consoletitle: Display progress in console window's titlebar.
563 nopart: Do not use temporary .part files.
564 updatetime: Use the Last-modified header to set output file timestamps.
565 writedescription: Write the video description to a .description file
566 writeinfojson: Write the video description to a .info.json file
567 writesubtitles: Write the video subtitles to a .srt file
568 subtitleslang: Language of the subtitles to download
# Class-level defaults; both are overwritten per-instance in __init__.
574 _download_retcode = None
575 _num_downloads = None
578 def __init__(self, params):
579 """Create a FileDownloader object with the given options."""
# (Initialisation of the IE/PP lists and self.params is elided.)
582 self._download_retcode = 0
583 self._num_downloads = 0
# Route status output to stderr when 'logtostderr' is set.
584 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# ---- Static helpers for progress formatting and rate control ----
588 def format_bytes(bytes):
# Human-readable byte count (e.g. '1.23M'); the exponent of 1024 picks
# the suffix.  Guard lines for None/zero are elided.
591 if type(bytes) is str:
596 exponent = long(math.log(bytes, 1024.0))
597 suffix = 'bkMGTPEZY'[exponent]
598 converted = float(bytes) / float(1024 ** exponent)
599 return '%.2f%s' % (converted, suffix)
602 def calc_percent(byte_counter, data_len):
# Right-aligned percentage string; the unknown-length guard is elided.
605 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
608 def calc_eta(start, now, total, current):
# Estimate remaining time from the average rate observed so far.
612 if current == 0 or dif < 0.001: # One millisecond
614 rate = float(current) / dif
615 eta = long((float(total) - float(current)) / rate)
616 (eta_mins, eta_secs) = divmod(eta, 60)
619 return '%02d:%02d' % (eta_mins, eta_secs)
622 def calc_speed(start, now, bytes):
624 if bytes == 0 or dif < 0.001: # One millisecond
625 return '%10s' % '---b/s'
626 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
629 def best_block_size(elapsed_time, bytes):
# Adapt the next read size to observed throughput, clamped to at most
# double / at least half the previous block and to a 4 MB ceiling.
630 new_min = max(bytes / 2.0, 1.0)
631 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
632 if elapsed_time < 0.001:
634 rate = bytes / elapsed_time
642 def parse_bytes(bytestr):
643 """Parse a string indicating a byte quantity into a long integer."""
644 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# An empty suffix indexes to 0 ('b'), i.e. a multiplier of 1.
647 number = float(matchobj.group(1))
648 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
649 return long(round(number * multiplier))
651 def add_info_extractor(self, ie):
652 """Add an InfoExtractor object to the end of the list."""
# The list append is elided; registration is mutual -- the IE learns
# its downloader here.
654 ie.set_downloader(self)
656 def add_post_processor(self, pp):
657 """Add a PostProcessor object to the end of the chain."""
659 pp.set_downloader(self)
661 def to_screen(self, message, skip_eol=False):
662 """Print message to stdout if not in quiet mode."""
663 assert type(message) == type(u'')
664 if not self.params.get('quiet', False):
# skip_eol suppresses the trailing newline (used by progress updates).
665 terminator = [u'\n', u''][skip_eol]
666 output = message + terminator
668 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
669 output = output.encode(preferredencoding(), 'ignore')
670 self._screen_file.write(output)
671 self._screen_file.flush()
673 def to_stderr(self, message):
674 """Print message to stderr."""
# Python 2 print-chevron syntax; the message is encoded with the
# locale's preferred encoding before writing.
675 print >>sys.stderr, message.encode(preferredencoding())
677 def to_cons_title(self, message):
678 """Set console/terminal window title to message."""
679 if not self.params.get('consoletitle', False):
# (early return elided)
681 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
682 # c_wchar_p() might not be necessary if `message` is
683 # already of type unicode()
684 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
685 elif 'TERM' in os.environ:
# xterm title escape sequence: ESC ] 0 ; <title> BEL
686 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
688 def fixed_template(self):
689 """Checks if the output template is fixed."""
690 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
692 def trouble(self, message=None):
693 """Determine action to take when a download problem appears.
695 Depending on if the downloader has been configured to ignore
696 download errors or not, this method may throw an exception or
697 not when errors are found, after printing the message.
699 if message is not None:
700 self.to_stderr(message)
701 if not self.params.get('ignoreerrors', False):
702 raise DownloadError(message)
# With ignoreerrors set, record the failure in the exit code and go on.
703 self._download_retcode = 1
705 def slow_down(self, start_time, byte_counter):
706 """Sleep if the download speed is over the rate limit."""
707 rate_limit = self.params.get('ratelimit', None)
708 if rate_limit is None or byte_counter == 0:
# (early return elided)
711 elapsed = now - start_time
714 speed = float(byte_counter) / elapsed
715 if speed > rate_limit:
# Sleep exactly long enough that the average speed drops to the limit.
716 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
718 def temp_name(self, filename):
719 """Returns a temporary filename for the given filename."""
# No .part file for '-' (stdout), when nopart is set, or when the
# target exists but is not a regular file (e.g. a named pipe).
720 if self.params.get('nopart', False) or filename == u'-' or \
721 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
# (return of the unmodified filename is elided)
723 return filename + u'.part'
725 def undo_temp_name(self, filename):
# Strip a trailing '.part'; the pass-through return for other names is
# on an elided line.
726 if filename.endswith(u'.part'):
727 return filename[:-len(u'.part')]
730 def try_rename(self, old_filename, new_filename):
# Renaming onto itself is a no-op; failures are reported via trouble()
# rather than propagated.
732 if old_filename == new_filename:
734 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
735 except (IOError, OSError), err:
736 self.trouble(u'ERROR: unable to rename file')
738 def try_utime(self, filename, last_modified_hdr):
739 """Try to set the last-modified time of the given file."""
740 if last_modified_hdr is None:
742 if not os.path.isfile(_encodeFilename(filename)):
744 timestr = last_modified_hdr
# Parse the RFC 2822 date; on success stamp the file's mtime while
# keeping atime current.  Error-handling lines are elided.
747 filetime = timeconvert(timestr)
751 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Tell the user the .description file is being written."""
    msg = u'[info] Writing video description to: ' + descfn
    self.to_screen(msg)
def report_writesubtitles(self, srtfn):
    """Tell the user the .srt subtitles file is being written."""
    msg = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(msg)
def report_writeinfojson(self, infofn):
    """Tell the user the .info.json metadata file is being written."""
    msg = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(msg)
def report_destination(self, filename):
    """Announce the destination filename."""
    msg = u'[download] Destination: ' + filename
    self.to_screen(msg)
772 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
773 """Report download progress."""
774 if self.params.get('noprogress', False):
# (early return elided)
# '\r' rewinds to the start of the line so the bar updates in place.
776 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
777 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
778 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
779 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at *resume_len*."""
    msg = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Announce a retry after a server-side HTTP 5xx error."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
789 def report_file_already_downloaded(self, file_name):
790 """Report file has already been fully downloaded."""
# Fall back to a generic message when the filename cannot be encoded
# for the console (the try: line is elided).
792 self.to_screen(u'[download] %s has already been downloaded' % file_name)
793 except (UnicodeEncodeError), err:
794 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the download was not possible."""
    self.to_screen(u'[download] Unable to resume')
800 def report_finish(self):
801 """Report download finished."""
# With noprogress the bar was never shown, so print a plain completion
# message; the alternate branch (closing the progress line) is elided.
802 if self.params.get('noprogress', False):
803 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the ordinal that numbers each downloaded file."""
    self._num_downloads = self._num_downloads + 1
811 def prepare_filename(self, info_dict):
812 """Generate the output filename."""
814 template_dict = dict(info_dict)
# epoch/autonumber are synthetic template fields added at render time.
815 template_dict['epoch'] = unicode(long(time.time()))
816 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
817 filename = self.params['outtmpl'] % template_dict
819 except (ValueError, KeyError), err:
820 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# (the return of the rendered filename, and of None on failure, is on
# elided lines)
823 def _match_entry(self, info_dict):
824 """ Returns None iff the file should be downloaded """
826 title = info_dict['title']
827 matchtitle = self.params.get('matchtitle', False)
828 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
829 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
830 rejecttitle = self.params.get('rejecttitle', False)
831 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
832 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
# NOTE(review): the matchtitle message carries a '[download] ' prefix
# but the rejecttitle one does not, yet process_info prepends
# '[download] ' to whatever is returned -- the first message ends up
# double-prefixed.  The final `return None` is on an elided line.
835 def process_info(self, info_dict):
836 """Process a single dictionary returned by an InfoExtractor."""
# Title filters first: skip the file entirely when a match/reject
# pattern applies.
838 reason = self._match_entry(info_dict)
839 if reason is not None:
840 self.to_screen(u'[download] ' + reason)
# (return elided)
843 max_downloads = self.params.get('max_downloads')
844 if max_downloads is not None:
845 if self._num_downloads > int(max_downloads):
846 raise MaxDownloadsReached()
848 filename = self.prepare_filename(info_dict)
# Forced printing of individual fields (--get-title, --get-url, ...).
851 if self.params.get('forcetitle', False):
852 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
853 if self.params.get('forceurl', False):
854 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
855 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
856 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
857 if self.params.get('forcedescription', False) and 'description' in info_dict:
858 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
859 if self.params.get('forcefilename', False) and filename is not None:
860 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
861 if self.params.get('forceformat', False):
862 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
864 # Do nothing else if in simulate mode
865 if self.params.get('simulate', False):
# Ensure the target directory exists before anything is written.
872 dn = os.path.dirname(_encodeFilename(filename))
873 if dn != '' and not os.path.exists(dn): # dn is already encoded
875 except (OSError, IOError), err:
876 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
879 if self.params.get('writedescription', False):
881 descfn = filename + u'.description'
882 self.report_writedescription(descfn)
883 descfile = open(_encodeFilename(descfn), 'wb')
885 descfile.write(info_dict['description'].encode('utf-8'))
888 except (OSError, IOError):
889 self.trouble(u'ERROR: Cannot write description file ' + descfn)
892 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
893 # subtitles download errors are already managed as troubles in relevant IE
894 # that way it will silently go on when used with unsupporting IE
896 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
897 self.report_writesubtitles(srtfn)
898 srtfile = open(_encodeFilename(srtfn), 'wb')
900 srtfile.write(info_dict['subtitles'].encode('utf-8'))
903 except (OSError, IOError):
# NOTE(review): this error message reuses `descfn` from the description
# branch instead of `srtfn`, so the wrong (or an unbound) filename is
# reported -- bug.
904 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
907 if self.params.get('writeinfojson', False):
908 infofn = filename + u'.info.json'
909 self.report_writeinfojson(infofn)
912 except (NameError,AttributeError):
913 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
916 infof = open(_encodeFilename(infofn), 'wb')
# 'urlhandle' holds a live connection object and cannot be serialised.
918 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
919 json.dump(json_info_dict, infof)
922 except (OSError, IOError):
923 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
926 if not self.params.get('skip_download', False):
927 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
931 success = self._do_download(filename, info_dict)
932 except (OSError, IOError), err:
933 raise UnavailableVideoError
934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
935 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
937 except (ContentTooShortError, ), err:
938 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Run the postprocessor chain only after a successful download.
943 self.post_process(filename, info_dict)
944 except (PostProcessingError), err:
945 self.trouble(u'ERROR: postprocessing: %s' % str(err))
948 def download(self, url_list):
949 """Download a given list of URLs."""
# A fixed (placeholder-free) template cannot hold more than one file.
950 if len(url_list) > 1 and self.fixed_template():
951 raise SameFileError(self.params['outtmpl'])
# (the loops over url_list and self._ies are partly elided)
954 suitable_found = False
956 # Go to next InfoExtractor if not suitable
957 if not ie.suitable(url):
960 # Suitable InfoExtractor found
961 suitable_found = True
963 # Extract information from URL and process it
966 # Suitable InfoExtractor had been found; go to next URL
969 if not suitable_found:
970 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
972 return self._download_retcode
974 def post_process(self, filename, ie_info):
975 """Run the postprocessing chain on the given file."""
# (copying ie_info and the loop over the registered postprocessors are
# on elided lines)
977 info['filepath'] = filename
983 def _download_with_rtmpdump(self, filename, url, player_url):
# Delegate rtmp:// downloads to the external rtmpdump binary, retrying
# with -e (resume) until it stops making progress.
984 self.report_destination(filename)
985 tmpfilename = self.temp_name(filename)
987 # Check for rtmpdump first
989 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
990 except (OSError, IOError):
991 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
994 # Download using rtmpdump. rtmpdump returns exit code 2 when
995 # the connection was interrumpted and resuming appears to be
996 # possible. This is part of rtmpdump's normal usage, AFAIK.
997 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
998 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
999 if self.params.get('verbose', False):
1002 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
1005 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
1006 retval = subprocess.call(args)
1007 while retval == 2 or retval == 1:
1008 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
1009 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
1010 time.sleep(5.0) # This seems to be needed
1011 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
1012 cursize = os.path.getsize(_encodeFilename(tmpfilename))
# No forward progress between retries: stop (break line elided).
1013 if prevsize == cursize and retval == 1:
1015 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
1016 if prevsize == cursize and retval == 2 and cursize > 1024:
1017 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
1021 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
1022 self.try_rename(tmpfilename, filename)
1025 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
1028 def _do_download(self, filename, info_dict):
# Core HTTP download loop: resume support, retries on 5xx, adaptive
# block size, progress reporting and rate limiting.
1029 url = info_dict['url']
1030 player_url = info_dict.get('player_url', None)
1032 # Check file already present
1033 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
1034 self.report_file_already_downloaded(filename)
1037 # Attempt to download using rtmpdump
1038 if url.startswith('rtmp'):
1039 return self._download_with_rtmpdump(filename, url, player_url)
1041 tmpfilename = self.temp_name(filename)
1044 # Do not include the Accept-Encoding header
1045 headers = {'Youtubedl-no-compression': 'True'}
1046 basic_request = urllib2.Request(url, None, headers)
1047 request = urllib2.Request(url, None, headers)
1049 # Establish possible resume length
1050 if os.path.isfile(_encodeFilename(tmpfilename)):
1051 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
1057 if self.params.get('continuedl', False):
1058 self.report_resuming_byte(resume_len)
1059 request.add_header('Range','bytes=%d-' % resume_len)
1065 retries = self.params.get('retries', 0)
1066 while count <= retries:
1067 # Establish connection
1069 if count == 0 and 'urlhandle' in info_dict:
1070 data = info_dict['urlhandle']
# NOTE(review): an unconditional urlopen follows the urlhandle reuse;
# presumably an elided `else:` separates them -- confirm against the
# full source.
1071 data = urllib2.urlopen(request)
1073 except (urllib2.HTTPError, ), err:
1074 if (err.code < 500 or err.code >= 600) and err.code != 416:
1075 # Unexpected HTTP error
1077 elif err.code == 416:
1078 # Unable to resume (requested range not satisfiable)
1080 # Open the connection again without the range header
1081 data = urllib2.urlopen(basic_request)
1082 content_length = data.info()['Content-Length']
1083 except (urllib2.HTTPError, ), err:
1084 if err.code < 500 or err.code >= 600:
1087 # Examine the reported length
1088 if (content_length is not None and
1089 (resume_len - 100 < long(content_length) < resume_len + 100)):
1090 # The file had already been fully downloaded.
1091 # Explanation to the above condition: in issue #175 it was revealed that
1092 # YouTube sometimes adds or removes a few bytes from the end of the file,
1093 # changing the file size slightly and causing problems for some users. So
1094 # I decided to implement a suggested change and consider the file
1095 # completely downloaded if the file size differs less than 100 bytes from
1096 # the one in the hard drive.
1097 self.report_file_already_downloaded(filename)
1098 self.try_rename(tmpfilename, filename)
1101 # The length does not match, we start the download over
1102 self.report_unable_to_resume()
1107 if count <= retries:
1108 self.report_retry(count, retries)
1111 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Content-Length plus what is already on disk gives the full size.
1114 data_len = data.info().get('Content-length', None)
1115 if data_len is not None:
1116 data_len = long(data_len) + resume_len
1117 data_len_str = self.format_bytes(data_len)
1118 byte_counter = 0 + resume_len
1122 # Download and write
1123 before = time.time()
1124 data_block = data.read(block_size)
1126 if len(data_block) == 0:
1128 byte_counter += len(data_block)
1130 # Open file just in time
1133 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1134 assert stream is not None
# sanitize_open may have altered the name; recompute the final one.
1135 filename = self.undo_temp_name(tmpfilename)
1136 self.report_destination(filename)
1137 except (OSError, IOError), err:
1138 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1141 stream.write(data_block)
1142 except (IOError, OSError), err:
1143 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1145 block_size = self.best_block_size(after - before, len(data_block))
# Progress display and throttling.
1148 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1149 if data_len is None:
1150 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1152 percent_str = self.calc_percent(byte_counter, data_len)
1153 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1154 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1157 self.slow_down(start, byte_counter - resume_len)
1160 self.trouble(u'\nERROR: Did not get any data blocks')
1163 self.report_finish()
1164 if data_len is not None and byte_counter != data_len:
1165 raise ContentTooShortError(byte_counter, long(data_len))
1166 self.try_rename(tmpfilename, filename)
1168 # Update file modification time
1169 if self.params.get('updatetime', True):
# NOTE(review): try_utime's visible body has no return statement, yet
# its result is stored as 'filetime' -- presumably the return is on an
# elided line; confirm against the full source.
1170 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Subclasses define _VALID_URL; a non-None match means "this IE handles it".
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): upstream guards this call with a once-only flag; that
        # guard line is not visible here -- confirm before assuming idempotence.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Accepts youtu.be short links, /v/, /embed/, /e/ and watch URLs as well
    # as bare ids; group(2) carries the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the site into English so the scraping regexps below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'  # machine key for ~/.netrc credential lookup
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; NOTE(review): dict body appears truncated in this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions; NOTE(review): dict body appears truncated in this view.
    _video_dimensions = {
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert YouTube's timedtext XML into SubRip (.srt) text.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when dur attribute is absent
            start = float(start)
            end = start + float(dur)
            # hh:mm:ss,mmm timestamps as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = _unescapeHTML(caption)
            caption = _unescapeHTML(caption) # double cycle, intentional: the feed is double-escaped
            srt += str(n) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        # Dump the itag / extension / dimensions table for --list-formats.
        print 'Available formats:'
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort: failure only warns)
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form being present in the response means login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes before using the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' values, first response with a token wins
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = _simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date layouts
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description: video_description = clean_html(video_description.decode('utf8'))
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # Language preference: explicit option, then English, then first listed.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'stitle':   simple_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        # Keeps a YoutubeIE so YouTube-hosted metacafe ids ('yt-...') can be delegated.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by posting the family-filter form
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor for 'yt-<id>' videos.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # First try the direct mediaURL form of the page...
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # ...otherwise fall back to the flashvars/mediaData JSON form.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fall back to the flash stream when no direct mp4 download_url exists.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the literal backslash-x escapes the page embeds in the URL.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires a second search-page fetch)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'upload_date':  u'NA',
            'title':    video_title,
            'stitle':   simple_title,
            'ext':      video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1942 class PhotobucketIE(InfoExtractor):
1943 """Information extractor for photobucket.com."""
1945 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1946 IE_NAME = u'photobucket'
1948 def __init__(self, downloader=None):
1949 InfoExtractor.__init__(self, downloader)
1951 def report_download_webpage(self, video_id):
1952 """Report webpage download."""
1953 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1955 def report_extraction(self, video_id):
1956 """Report information extraction."""
1957 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1959 def _real_extract(self, url):
1960 # Extract id from URL
1961 mobj = re.match(self._VALID_URL, url)
1963 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1966 # At this point we have a new video
1967 self._downloader.increment_downloads()
1968 video_id = mobj.group(1)
1970 video_extension = 'flv'
1972 # Retrieve video webpage to extract further information
1973 request = urllib2.Request(url)
1975 self.report_download_webpage(video_id)
1976 webpage = urllib2.urlopen(request).read()
1977 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1978 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1981 # Extract URL, uploader, and title from webpage
1982 self.report_extraction(video_id)
1983 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1985 self._downloader.trouble(u'ERROR: unable to extract media URL')
1987 mediaURL = urllib.unquote(mobj.group(1))
1989 video_url = mediaURL
1991 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1993 self._downloader.trouble(u'ERROR: unable to extract title')
1995 video_title = mobj.group(1).decode('utf-8')
1996 video_title = sanitize_title(video_title)
1997 simple_title = _simplify_title(vide_title)
1999 video_uploader = mobj.group(2).decode('utf-8')
2002 # Process video information
2003 self._downloader.process_info({
2004 'id': video_id.decode('utf-8'),
2005 'url': video_url.decode('utf-8'),
2006 'uploader': video_uploader,
2007 'upload_date': u'NA',
2008 'title': video_title,
2009 'stitle': simple_title,
2010 'ext': video_extension.decode('utf-8'),
2014 except UnavailableVideoError:
2015 self._downloader.trouble(u'\nERROR: unable to download video')
2018 class YahooIE(InfoExtractor):
2019 """Information extractor for video.yahoo.com."""
2021 # _VALID_URL matches all Yahoo! Video URLs
2022 # _VPAGE_URL matches only the extractable '/watch/' URLs
2023 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
2024 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
2025 IE_NAME = u'video.yahoo'
2027 def __init__(self, downloader=None):
2028 InfoExtractor.__init__(self, downloader)
2030 def report_download_webpage(self, video_id):
2031 """Report webpage download."""
2032 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
2034 def report_extraction(self, video_id):
2035 """Report information extraction."""
2036 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Resolve *url* to a canonical video.yahoo.com /watch/ page, scrape the
        video metadata and playlist, and hand the result to the downloader.

        NOTE(review): several guard/`try:` lines are missing from this excerpt;
        the bare `trouble(...)` calls below were originally the error branches
        of `if mobj is None:` checks — confirm against the full file.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # Error branch: URL did not match _VALID_URL.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        # The second capture group of _VALID_URL is the video id.
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            # The page embeds numeric "id" and "vid" fields that together
            # identify the canonical /watch/<vid>/<id> URL.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical URL; new_video=False marks the
            # recursive call so the video is not counted twice.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = _simplify_title(video_title)

        # NOTE(review): group(1) here is the (people|profile) alternation, not
        # the uploader name — group(2) holds the anchor text; likely a bug.
        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; both are required query parameters
        # for the playlist request below.
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        # APP + FULLPATH concatenated form the full stream URL.
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = _unescapeHTML(video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): 'thumbnail' appears twice in this literal; the
            # second (un-decoded) entry silently wins. Drop one of them.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Scrape a Vimeo page, parse its embedded config JSON, and hand the
        video metadata to the downloader.

        NOTE(review): several `if mobj is None:`/`try:` lines are elided in
        this excerpt; bare `trouble(...)` calls are the original error paths.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing the page between the
        # ' = {config:' and ',assets:' markers.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        video_title = config["video"]["title"]
        simple_title = _simplify_title(video_title)

        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (HTML element with id="description").
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description.decode('utf8'))
        else: video_description = ''

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Codecs are tried in preference order; the last match wins since the
        # loop does not break.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # Error branch: none of the known codecs were present.
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the play_redirect URL from the signature/timestamp pair.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # Process video information
        self._downloader.process_info({
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Best-effort extraction: look for an embedded media URL in an
        arbitrary page (JW Player flashvars, then file=/source= params).

        NOTE(review): guard lines (`if mobj is None:`, `try:`) are elided in
        this excerpt; bare `trouble(...)` calls are the original error paths.
        """
        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = _simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): failure here concerns the uploader/domain, but the
        # message says "title" — likely a copy-paste slip.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts "ytsearch:<query>", "ytsearchN:<query>" or "ytsearchall:<query>".
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the GData API (50 ids per page) until n is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the effective limit below the requested n.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Accepts "gvsearch:<query>", "gvsearchN:<query>" or "gvsearchall:<query>".
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        """Keep a GoogleIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Scrape result pages until n ids are collected or no next-page link.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No next-page marker: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # Accepts "yvsearch:<query>", "yvsearchN:<query>" or "yvsearchall:<query>".
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        """Keep a YahooIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix to determine how many results to fetch.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap; numeric -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen deduplicates ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No next-page marker: flush what we have and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids from a playlist and delegate each to YoutubeIE.

        NOTE(review): guard/`try:`/loop-setup lines are elided in this excerpt.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Group 3 present means a single-video link inside a playlist URL:
        # delegate straight to the video extractor.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 entries, so we page through.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        """Keep a YoutubeIE instance to delegate the per-video extraction to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all upload ids for a user via the GData API and delegate
        each to YoutubeIE.

        NOTE(review): guard/`try:`/loop-setup lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # Apply the user's --playlist-start/--playlist-end window.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Simulate pressing 'Free download' on a DepositFiles page and scrape
        the resulting direct file URL.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 mimics the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats in descending quality order; first available wins by default.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Initialize the extractor, delegating setup to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes keyed by the video_info field each one fills.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials.

        NOTE(review): `try:`/`return` lines are elided in this excerpt.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials: skip login entirely (videos may still be public).
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form being echoed back means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Fetch the video page, parse it with _parse_page, pick the requested
        format(s) and hand each to the downloader.

        NOTE(review): guard/`try:` lines are elided in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = _simplify_title(video_title)

        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
3091 class BlipTVIE(InfoExtractor):
3092 """Information extractor for blip.tv"""
# Matches any blip.tv URL on any subdomain; group(1) captures the path.
3094 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the trailing lowercase filename extension of a media URL.
3095 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3096 IE_NAME = u'blip.tv'
3098 def report_extraction(self, file_id):
3099 """Report information extraction."""
3100 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3102 def report_direct_download(self, title):
3103 """Report that a direct (non-JSON) media download was detected."""
3104 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3106 def _real_extract(self, url):
3107 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL; report and (presumably) bail out —
# the early-return line is on source not visible here.
3109 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the post. 'cchar' is the '?'/'&'
# query separator, chosen on source lines not visible here — TODO confirm.
3116 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3117 request = urllib2.Request(json_url)
3118 self.report_extraction(mobj.group(1))
3121 urlh = urllib2.urlopen(request)
# A video/* Content-Type means the URL points at the media file itself:
# derive title/extension from the URL instead of parsing JSON.
3122 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3123 basename = url.split('/')[-1]
3124 title,ext = os.path.splitext(basename)
3125 title = title.decode('UTF-8')
3126 ext = ext.replace('.', '')
3127 self.report_direct_download(title)
3132 'stitle': _simplify_title(title),
3136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3137 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No direct download was detected above; parse the JSON response instead.
3139 if info is None: # Regular URL
3141 json_code = urlh.read()
3142 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3143 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3147 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the post itself.
3148 if 'Post' in json_data:
3149 data = json_data['Post']
# blip.tv datestamp, e.g. '12-31-11 11:59PM', normalized to YYYYMMDD.
3153 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3154 video_url = data['media']['url']
# Determine the container extension from the media URL.
3155 umobj = re.match(self._URL_EXT, video_url)
3157 raise ValueError('Can not determine filename extension')
3158 ext = umobj.group(1)
3161 'id': data['item_id'],
3163 'uploader': data['display_name'],
3164 'upload_date': upload_date,
3165 'title': data['title'],
3166 'stitle': _simplify_title(data['title']),
3168 'format': data['media']['mimeType'],
3169 'thumbnail': data['thumbnailUrl'],
3170 'description': data['description'],
3171 'player_url': data['embedUrl']
# KeyError covers missing JSON fields; ValueError covers bad dates/extensions.
3173 except (ValueError,KeyError), err:
3174 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3177 self._downloader.increment_downloads()
3180 self._downloader.process_info(info)
3181 except UnavailableVideoError, err:
3182 self._downloader.trouble(u'\nERROR: unable to download video')
3185 class MyVideoIE(InfoExtractor):
3186 """Information Extractor for myvideo.de."""
3188 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3189 IE_NAME = u'myvideo'
3191 def __init__(self, downloader=None):
3192 InfoExtractor.__init__(self, downloader)
3194 def report_download_webpage(self, video_id):
3195 """Report webpage download."""
3196 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3198 def report_extraction(self, video_id):
3199 """Report information extraction."""
3200 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3202 def _real_extract(self,url):
3203 mobj = re.match(self._VALID_URL, url)
3205 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3208 video_id = mobj.group(1)
3211 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3213 self.report_download_webpage(video_id)
3214 webpage = urllib2.urlopen(request).read()
3215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3219 self.report_extraction(video_id)
3220 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3223 self._downloader.trouble(u'ERROR: unable to extract media URL')
3225 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3227 mobj = re.search('<title>([^<]+)</title>', webpage)
3229 self._downloader.trouble(u'ERROR: unable to extract title')
3232 video_title = mobj.group(1)
3233 video_title = sanitize_title(video_title)
3235 simple_title = _simplify_title(video_title)
3238 self._downloader.process_info({
3242 'upload_date': u'NA',
3243 'title': video_title,
3244 'stitle': simple_title,
3249 except UnavailableVideoError:
3250 self._downloader.trouble(u'\nERROR: Unable to download video')
3252 class ComedyCentralIE(InfoExtractor):
3253 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ':tds'/':colbert'-style shortname or a full-episodes URL.
# Named groups: 'shortname', 'showname', 'episode'.
3255 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3256 IE_NAME = u'comedycentral'
3258 def report_extraction(self, episode_id):
3259 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3261 def report_config_download(self, episode_id):
3262 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3264 def report_index_download(self, episode_id):
3265 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3267 def report_player_url(self, episode_id):
3268 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3270 def _real_extract(self, url):
3271 mobj = re.match(self._VALID_URL, url)
3273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortname maps to the show's "full episodes" landing page, which is
# then re-matched so the normal episode flow applies.
3276 if mobj.group('shortname'):
3277 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3278 url = u'http://www.thedailyshow.com/full-episodes/'
3280 url = u'http://www.colbertnation.com/full-episodes/'
3281 mobj = re.match(self._VALID_URL, url)
3282 assert mobj is not None
# An empty episode group means "download the newest episode".
3284 dlNewest = not mobj.group('episode')
3286 epTitle = mobj.group('showname')
3288 epTitle = mobj.group('episode')
3290 req = urllib2.Request(url)
3291 self.report_extraction(epTitle)
3293 htmlHandle = urllib2.urlopen(req)
3294 html = htmlHandle.read()
3295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3296 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-match the final URL
# to recover the episode slug.
3299 url = htmlHandle.geturl()
3300 mobj = re.match(self._VALID_URL, url)
3302 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3304 if mobj.group('episode') == '':
3305 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3307 epTitle = mobj.group('episode')
# The embedded Flash player URL carries an mtvnservices URI that identifies
# the episode's media (second capture group).
3309 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3310 if len(mMovieParams) == 0:
3311 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the raw player URL through redirects to get the final SWF location.
3314 playerUrl_raw = mMovieParams[0][0]
3315 self.report_player_url(epTitle)
3317 urlHandle = urllib2.urlopen(playerUrl_raw)
3318 playerUrl = urlHandle.geturl()
3319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3320 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index listing the episode's segments.
3323 uri = mMovieParams[0][1]
3324 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3325 self.report_index_download(epTitle)
3327 indexXml = urllib2.urlopen(indexUrl).read()
3328 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3329 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3332 idoc = xml.etree.ElementTree.fromstring(indexXml)
# Each <item> is one segment (act) of the episode; download them in order.
3333 itemEls = idoc.findall('.//item')
3334 for itemEl in itemEls:
3335 mediaId = itemEl.findall('./guid')[0].text
3336 shortMediaId = mediaId.split(':')[-1]
3337 showId = mediaId.split(':')[-2].replace('.com', '')
3338 officialTitle = itemEl.findall('./title')[0].text
3339 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate -> src).
3341 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3342 urllib.urlencode({'uri': mediaId}))
3343 configReq = urllib2.Request(configUrl)
3344 self.report_config_download(epTitle)
3346 configXml = urllib2.urlopen(configReq).read()
3347 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3348 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3351 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) pairs; 'turls' is accumulated on source lines
# not visible here — presumably turls.append(finfo).
3353 for rendition in cdoc.findall('.//rendition'):
3354 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3358 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3361 # For now, just pick the highest bitrate
3362 format,video_url = turls[-1]
3364 self._downloader.increment_downloads()
3366 effTitle = showId + u'-' + epTitle
3371 'upload_date': officialDate,
3373 'stitle': _simplify_title(effTitle),
3377 'description': officialTitle,
3378 'player_url': playerUrl
3382 self._downloader.process_info(info)
3383 except UnavailableVideoError, err:
3384 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3388 class EscapistIE(InfoExtractor):
3389 """Information extractor for The Escapist """
# Named groups: 'showname' (series slug), 'episode' (episode slug).
3391 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3392 IE_NAME = u'escapist'
3394 def report_extraction(self, showName):
3395 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3397 def report_config_download(self, showName):
3398 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3400 def _real_extract(self, url):
3401 mobj = re.match(self._VALID_URL, url)
3403 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3405 showName = mobj.group('showname')
3406 videoId = mobj.group('episode')
3408 self.report_extraction(showName)
3410 webPage = urllib2.urlopen(url).read()
3411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3412 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Description, thumbnail and player URL all come from <meta> tags.
3415 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3416 description = _unescapeHTML(descMatch.group(1))
3417 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3418 imgUrl = _unescapeHTML(imgMatch.group(1))
3419 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3420 playerUrl = _unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded 'config=' query parameter
# pointing at the playlist configuration.
3421 configUrlMatch = re.search('config=(.*)$', playerUrl)
3422 configUrl = urllib2.unquote(configUrlMatch.group(1))
3424 self.report_config_download(showName)
3426 configJSON = urllib2.urlopen(configUrl).read()
3427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3428 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3431 # Technically, it's JavaScript, not JSON
# Crude single- to double-quote conversion so json.loads accepts it;
# breaks if the config ever contains quoted apostrophes.
3432 configJSON = configJSON.replace("'", '"')
3435 config = json.loads(configJSON)
3436 except (ValueError,), err:
3437 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Index 1 of the playlist holds the actual video entry — presumably index 0
# is an ad or intro; verify against a live config.
3440 playlist = config['playlist']
3441 videoUrl = playlist[1]['url']
3443 self._downloader.increment_downloads()
3447 'uploader': showName,
3448 'upload_date': None,
3450 'stitle': _simplify_title(showName),
3453 'thumbnail': imgUrl,
3454 'description': description,
3455 'player_url': playerUrl,
3459 self._downloader.process_info(info)
3460 except UnavailableVideoError, err:
3461 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3464 class CollegeHumorIE(InfoExtractor):
3465 """Information extractor for collegehumor.com"""
# Named groups: 'videoid' (numeric public id), 'shorttitle' (slug).
3467 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3468 IE_NAME = u'collegehumor'
3470 def report_webpage(self, video_id):
3471 """Report information extraction."""
3472 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3474 def report_extraction(self, video_id):
3475 """Report information extraction."""
3476 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3478 def _real_extract(self, url):
3479 mobj = re.match(self._VALID_URL, url)
3481 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3483 video_id = mobj.group('videoid')
3485 self.report_webpage(video_id)
3486 request = urllib2.Request(url)
3488 webpage = urllib2.urlopen(request).read()
3489 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3490 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ("video:<n>") distinct from the public id;
# the metadata XML endpoint is keyed on this internal id.
3493 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3495 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3497 internal_video_id = m.group('internalvideoid')
3501 'internal_id': internal_video_id,
3504 self.report_extraction(video_id)
# Fetch the moogaloop player's metadata XML for this internal id.
3505 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3507 metaXml = urllib2.urlopen(xmlUrl).read()
3508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3509 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3512 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Populate info from the <video> node; IndexError from the [0] lookups is
# presumably caught on source lines not visible here.
3514 videoNode = mdoc.findall('./video')[0]
3515 info['description'] = videoNode.findall('./description')[0].text
3516 info['title'] = videoNode.findall('./caption')[0].text
3517 info['stitle'] = _simplify_title(info['title'])
3518 info['url'] = videoNode.findall('./file')[0].text
3519 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the media URL's suffix and doubles as 'format'.
3520 info['ext'] = info['url'].rpartition('.')[2]
3521 info['format'] = info['ext']
3523 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3526 self._downloader.increment_downloads()
3529 self._downloader.process_info(info)
3530 except UnavailableVideoError, err:
3531 self._downloader.trouble(u'\nERROR: unable to download video')
3534 class XVideosIE(InfoExtractor):
3535 """Information extractor for xvideos.com"""
# group(1) is the numeric video id following '/video' in the path.
3537 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3538 IE_NAME = u'xvideos'
3540 def report_webpage(self, video_id):
3541 """Report information extraction."""
3542 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3544 def report_extraction(self, video_id):
3545 """Report information extraction."""
3546 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3548 def _real_extract(self, url):
3549 mobj = re.match(self._VALID_URL, url)
3551 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3553 video_id = mobj.group(1).decode('utf-8')
3555 self.report_webpage(video_id)
# Re-fetch via the canonical URL built from the id.
3557 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3559 webpage = urllib2.urlopen(request).read()
3560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3561 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3564 self.report_extraction(video_id)
# The media URL is a percent-encoded 'flv_url' parameter in the page.
3568 mobj = re.search(r'flv_url=(.+?)&', webpage)
3570 self._downloader.trouble(u'ERROR: unable to extract video url')
3572 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title comes from the <title> tag, minus the trailing " - XVID..." suffix.
3576 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3578 self._downloader.trouble(u'ERROR: unable to extract video title')
3580 video_title = mobj.group(1).decode('utf-8')
3583 # Extract video thumbnail
3584 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3586 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3588 video_thumbnail = mobj.group(1).decode('utf-8')
3592 self._downloader.increment_downloads()
3597 'upload_date': None,
3598 'title': video_title,
3599 'stitle': _simplify_title(video_title),
3602 'thumbnail': video_thumbnail,
3603 'description': None,
3608 self._downloader.process_info(info)
3609 except UnavailableVideoError, err:
3610 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3613 class SoundcloudIE(InfoExtractor):
3614 """Information extractor for soundcloud.com
3615 To access the media, the uid of the song and a stream token
3616 must be extracted from the page source and the script must make
3617 a request to media.soundcloud.com/crossdomain.xml. Then
3618 the media can be grabbed by requesting from an url composed
3619 of the stream token and uid
# group(1) is the uploader slug, group(2) the track slug.
3622 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3623 IE_NAME = u'soundcloud'
3625 def __init__(self, downloader=None):
3626 InfoExtractor.__init__(self, downloader)
3628 def report_webpage(self, video_id):
3629 """Report information extraction."""
3630 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3632 def report_extraction(self, video_id):
3633 """Report information extraction."""
3634 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3636 def _real_extract(self, url):
3637 mobj = re.match(self._VALID_URL, url)
3639 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3642 # extract uploader (which is in the url)
3643 uploader = mobj.group(1).decode('utf-8')
3644 # extract simple title (uploader + slug of song title)
3645 slug_title = mobj.group(2).decode('utf-8')
3646 simple_title = uploader + '-' + slug_title
3648 self.report_webpage('%s/%s' % (uploader, slug_title))
3650 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3652 webpage = urllib2.urlopen(request).read()
3653 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3654 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3657 self.report_extraction('%s/%s' % (uploader, slug_title))
3659 # extract uid and stream token that soundcloud hands out for access
3660 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3662 video_id = mobj.group(1)
3663 stream_token = mobj.group(2)
3665 # extract unsimplified title
3666 mobj = re.search('"title":"(.*?)",', webpage)
3668 title = mobj.group(1)
3670 # construct media url (with uid/token)
3671 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3672 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; fall back to a placeholder.
3675 description = u'No description available'
3676 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3678 description = mobj.group(1)
# Parse the human-readable date (e.g. 'December 31, 2011 23:59')
# into YYYYMMDD; failures are tolerated on lines not visible here.
3682 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3685 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3686 except Exception, e:
3689 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): the Request is built here but the urlopen call is on source
# lines not visible in this view — verify against upstream.
3690 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3693 self._downloader.process_info({
3694 'id': video_id.decode('utf-8'),
3696 'uploader': uploader.decode('utf-8'),
3697 'upload_date': upload_date,
3698 'title': simple_title.decode('utf-8'),
3699 'stitle': simple_title.decode('utf-8'),
3703 'description': description.decode('utf-8')
3705 except UnavailableVideoError:
3706 self._downloader.trouble(u'\nERROR: unable to download video')
3709 class InfoQIE(InfoExtractor):
3710 """Information extractor for infoq.com"""
3712 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3715 def report_webpage(self, video_id):
3716 """Report information extraction."""
3717 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3719 def report_extraction(self, video_id):
3720 """Report information extraction."""
3721 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3723 def _real_extract(self, url):
3724 mobj = re.match(self._VALID_URL, url)
3726 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3729 self.report_webpage(url)
3731 request = urllib2.Request(url)
3733 webpage = urllib2.urlopen(request).read()
3734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3735 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3738 self.report_extraction(url)
# 'jsclassref' holds a base64-encoded RTMP path; decode and append it to
# the fixed rtmpe stream root.
3742 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3744 self._downloader.trouble(u'ERROR: unable to extract video url')
3746 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3750 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3752 self._downloader.trouble(u'ERROR: unable to extract video title')
3754 video_title = mobj.group(1).decode('utf-8')
3756 # Extract description
3757 video_description = u'No description available.'
3758 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3759 if mobj is not None:
3760 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the media URL;
# assumes exactly one '.' in the filename (split would fail otherwise).
3762 video_filename = video_url.split('/')[-1]
3763 video_id, extension = video_filename.split('.')
3765 self._downloader.increment_downloads()
3770 'upload_date': None,
3771 'title': video_title,
3772 'stitle': _simplify_title(video_title),
3774 'format': extension, # Extension is always(?) mp4, but seems to be flv
3776 'description': video_description,
3781 self._downloader.process_info(info)
3782 except UnavailableVideoError, err:
3783 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3785 class MixcloudIE(InfoExtractor):
3786 """Information extractor for www.mixcloud.com"""
# group(1) is the uploader slug, group(2) the cloudcast slug.
3787 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3788 IE_NAME = u'mixcloud'
3790 def __init__(self, downloader=None):
3791 InfoExtractor.__init__(self, downloader)
3793 def report_download_json(self, file_id):
3794 """Report JSON download."""
3795 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3797 def report_extraction(self, file_id):
3798 """Report information extraction."""
3799 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3801 def get_urls(self, jsonData, fmt, bitrate='best'):
3802 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a flat url list;
# the TypeError fallback below handles the flat case.
3805 bitrate_list = jsonData[fmt]
3806 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3807 bitrate = max(bitrate_list) # select highest
3809 url_list = jsonData[fmt][bitrate]
3810 except TypeError: # we have no bitrate info.
3811 url_list = jsonData[fmt]
3815 def check_urls(self, url_list):
3816 """Returns 1st active url from list"""
# Probe each candidate with a GET; network errors mean "try the next one".
3817 for url in url_list:
3819 urllib2.urlopen(url)
3821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3826 def _print_formats(self, formats):
3827 print 'Available formats:'
3828 for fmt in formats.keys():
3829 for b in formats[fmt]:
3831 ext = formats[fmt][b][0]
3832 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
# Same flat-list fallback as in get_urls: no per-bitrate breakdown.
3833 except TypeError: # we have no bitrate info
3834 ext = formats[fmt][0]
3835 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3838 def _real_extract(self, url):
3839 mobj = re.match(self._VALID_URL, url)
3841 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3843 # extract uploader & filename from url
3844 uploader = mobj.group(1).decode('utf-8')
3845 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3847 # construct API request
# Rebuild '<uploader>/<slug>' from the URL tail for the cloudcast API.
3848 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3849 # retrieve .json file with links to files
3850 request = urllib2.Request(file_url)
3852 self.report_download_json(file_url)
3853 jsonData = urllib2.urlopen(request).read()
3854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3855 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3859 json_data = json.loads(jsonData)
3860 player_url = json_data['player_swf_url']
3861 formats = dict(json_data['audio_formats'])
3863 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop (return is on lines not visible here).
3866 if self._downloader.params.get('listformats', None):
3867 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
3870 if req_format is None or req_format == 'best':
3871 for format_param in formats.keys():
3872 url_list = self.get_urls(formats, format_param)
3874 file_url = self.check_urls(url_list)
3875 if file_url is not None:
3878 if req_format not in formats.keys():
3879 self._downloader.trouble(u'ERROR: format is not available')
3882 url_list = self.get_urls(formats, req_format)
3883 file_url = self.check_urls(url_list)
3884 format_param = req_format
3887 self._downloader.increment_downloads()
3889 # Process file information
3890 self._downloader.process_info({
3891 'id': file_id.decode('utf-8'),
3892 'url': file_url.decode('utf-8'),
3893 'uploader': uploader.decode('utf-8'),
3894 'upload_date': u'NA',
3895 'title': json_data['name'],
3896 'stitle': _simplify_title(json_data['name']),
3897 'ext': file_url.split('.')[-1].decode('utf-8'),
3898 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3899 'thumbnail': json_data['thumbnail_url'],
3900 'description': json_data['description'],
3901 'player_url': player_url.decode('utf-8'),
3903 except UnavailableVideoError, err:
3904 self._downloader.trouble(u'ERROR: unable to download file')
3906 class StanfordOpenClassroomIE(InfoExtractor):
3907 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: site root, a course page ('course' group only), or a
# specific video ('course' + 'video' groups).
3909 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3910 IE_NAME = u'stanfordoc'
3912 def report_download_webpage(self, objid):
3913 """Report information extraction."""
3914 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3916 def report_extraction(self, video_id):
3917 """Report information extraction."""
3918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3920 def _real_extract(self, url):
3921 mobj = re.match(self._VALID_URL, url)
3923 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video — fetch its per-video XML metadata.
3926 if mobj.group('course') and mobj.group('video'): # A specific video
3927 course = mobj.group('course')
3928 video = mobj.group('video')
3930 'id': _simplify_title(course + '_' + video),
3933 self.report_extraction(info['id'])
3934 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3935 xmlUrl = baseUrl + video + '.xml'
3937 metaXml = urllib2.urlopen(xmlUrl).read()
3938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3939 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3941 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# <videoFile> is relative to baseUrl; missing elements mean bad metadata.
3943 info['title'] = mdoc.findall('./title')[0].text
3944 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3946 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3948 info['stitle'] = _simplify_title(info['title'])
3949 info['ext'] = info['url'].rpartition('.')[2]
3950 info['format'] = info['ext']
3951 self._downloader.increment_downloads()
3953 self._downloader.process_info(info)
3954 except UnavailableVideoError, err:
3955 self._downloader.trouble(u'\nERROR: unable to download video')
# Case 2: a course page — collect its VideoPage links and recurse via
# self.extract on each reference entry.
3956 elif mobj.group('course'): # A course page
3957 course = mobj.group('course')
3959 'id': _simplify_title(course),
3963 self.report_download_webpage(info['id'])
3965 coursepage = urllib2.urlopen(url).read()
3966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3967 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3970 m = re.search('<h1>([^<]+)</h1>', coursepage)
3972 info['title'] = _unescapeHTML(m.group(1))
3974 info['title'] = info['id']
3975 info['stitle'] = _simplify_title(info['title'])
3977 m = re.search('<description>([^<]+)</description>', coursepage)
3979 info['description'] = _unescapeHTML(m.group(1))
# _orderedSet keeps first occurrence order while deduplicating links.
3981 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3984 'type': 'reference',
3985 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
3989 for entry in info['list']:
3990 assert entry['type'] == 'reference'
3991 self.extract(entry['url'])
# Case 3: the site root — collect all CoursePage links and recurse.
3994 'id': 'Stanford OpenClassroom',
3998 self.report_download_webpage(info['id'])
3999 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
4001 rootpage = urllib2.urlopen(rootURL).read()
4002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4003 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
4006 info['title'] = info['id']
4007 info['stitle'] = _simplify_title(info['title'])
4009 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
4012 'type': 'reference',
4013 'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
4017 for entry in info['list']:
4018 assert entry['type'] == 'reference'
4019 self.extract(entry['url'])
4021 class MTVIE(InfoExtractor):
4022 """Information extractor for MTV.com"""
4024 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4027 def report_webpage(self, video_id):
4028 """Report information extraction."""
4029 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4031 def report_extraction(self, video_id):
4032 """Report information extraction."""
4033 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4035 def _real_extract(self, url):
# Extract an MTV video: scrape song name and performer from the page's
# <meta> tags, then fetch the mediaGen XML playlist and pick a rendition.
# NOTE(review): this excerpt omits several original lines (e.g. the
# 'if mobj is None:' guards, 'try:' openers and 'return' statements between
# the numbered lines) -- consult the full file before changing control flow.
4036 mobj = re.match(self._VALID_URL, url)
4038 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Prepend a default scheme when the URL was given without one.
4040 if not mobj.group('proto'):
4041 url = 'http://' + url
4042 video_id = mobj.group('videoid')
4043 self.report_webpage(video_id)
4045 request = urllib2.Request(url)
4047 webpage = urllib2.urlopen(request).read()
4048 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4049 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song title and performer live in MTV-specific meta tags; the page bytes
# are decoded as ISO-8859-1 below.
4052 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4054 self._downloader.trouble(u'ERROR: unable to extract song name')
4056 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4057 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4059 self._downloader.trouble(u'ERROR: unable to extract performer')
4061 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4062 video_title = performer + ' - ' + song_name
4064 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4066 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4068 mtvn_uri = mobj.group(1)
4070 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4072 self._downloader.trouble(u'ERROR: unable to extract content id')
4074 content_id = mobj.group(1)
# mediaGen is MTV's server-side playlist generator; it returns XML with one
# <rendition> element per available quality level.
4076 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4077 self.report_extraction(video_id)
4078 request = urllib2.Request(videogen_url)
4080 metadataXml = urllib2.urlopen(request).read()
4081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4082 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4085 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4086 renditions = mdoc.findall('.//rendition')
4088 # For now, always pick the highest quality.
4089 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_800' from the rendition attributes.
4092 _,_,ext = rendition.attrib['type'].partition('/')
4093 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4094 video_url = rendition.find('./src').text
4096 self._downloader.trouble('Invalid rendition field.')
4099 self._downloader.increment_downloads()
# info dict handed to process_info(); the 'id'/'url'/'ext' entries are not
# visible in this excerpt.
4103 'uploader': performer,
4104 'title': video_title,
4105 'stitle': _simplify_title(video_title),
4111 self._downloader.process_info(info)
4112 except UnavailableVideoError, err:
4113 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for download post-processors.

	Instances are registered on a downloader via its add_post_processor()
	method ("mutual registration", as with InfoExtractor objects).  After a
	successful download the downloader calls run() on each registered
	post-processor in turn, feeding each one the dictionary returned by the
	previous one.  A post-processor that returns None stops the chain.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* to this post-processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		*information* is an InfoExtractor-style dictionary with an extra
		'filepath' key naming the downloaded file.  The return value is
		passed to the next post-processor in the chain; returning None
		stops the chain.  Implementations may raise PostProcessingError,
		which the downloader takes into account.

		The default implementation is a no-op that forwards *information*.
		"""
		return information
class AudioConversionError(Exception):
	"""Raised when the ffmpeg/ffprobe audio extraction step fails.

	Fixes two defects of the original:
	- it subclassed BaseException, so generic `except Exception` handlers
	  would not catch it (user-defined exceptions should derive from
	  Exception per the Python docs);
	- it never called the base initializer, leaving str(err) empty.
	Callers that catch AudioConversionError and read .message keep working.
	"""
	def __init__(self, message):
		Exception.__init__(self, message)
		# Explicit attribute kept for existing `e.message` consumers
		# (BaseException.message is deprecated/removed in newer Pythons).
		self.message = message
4166 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that turns a downloaded video into an audio-only file by
# shelling out to ffmpeg/ffprobe.
# NOTE(review): this excerpt is missing several original lines ('try:'
# openers, 'return's, and presumably a @staticmethod decorator before
# get_audio_codec) -- consult the full file before editing.
4168 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
# preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav' (None => 'best');
# preferredquality: ffmpeg '-ab' bitrate spec (e.g. '128K') or None;
# keepvideo: keep the source video file after audio extraction.
4169 PostProcessor.__init__(self, downloader)
4170 if preferredcodec is None:
4171 preferredcodec = 'best'
4172 self._preferredcodec = preferredcodec
4173 self._preferredquality = preferredquality
4174 self._keepvideo = keepvideo
# Probe the file with ffprobe; yields the audio codec name, or None when
# it cannot be determined.
4177 def get_audio_codec(path):
4179 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4180 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4181 output = handle.communicate()[0]
4182 if handle.wait() != 0:
4184 except (IOError, OSError):
# Scan ffprobe output: remember the last codec_name seen and use it once
# the surrounding stream turns out to be the audio stream.
4187 for line in output.split('\n'):
4188 if line.startswith('codec_name='):
4189 audio_codec = line.split('=')[1].strip()
4190 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode 'path' into 'out_path' with the given codec/options; raises
# AudioConversionError on failure.
4195 def run_ffmpeg(path, out_path, codec, more_opts):
4199 acodec_opts = ['-acodec', codec]
4200 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4202 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4203 stdout,stderr = p.communicate()
4204 except (IOError, OSError):
4205 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4206 if isinstance(e, OSError) and e.errno == 2:
4207 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4210 if p.returncode != 0:
# Surface only the last stderr line -- ffmpeg's actual failure reason.
4211 msg = stderr.strip().split('\n')[-1]
4212 raise AudioConversionError(msg)
4214 def run(self, information):
4215 path = information['filepath']
4217 filecodec = self.get_audio_codec(path)
4218 if filecodec is None:
4219 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Choose target codec/extension: copy or remux losslessly when the source
# codec already satisfies the request (or request is 'best'); otherwise
# transcode (lossy) using the table below.
4223 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4224 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4225 # Lossless, but in another container
4227 extension = self._preferredcodec
4228 more_opts = ['-absf', 'aac_adtstoasc']
4229 elif filecodec in ['aac', 'mp3', 'vorbis']:
4230 # Lossless if possible
4232 extension = filecodec
4233 if filecodec == 'aac':
4234 more_opts = ['-f', 'adts']
4235 if filecodec == 'vorbis':
4239 acodec = 'libmp3lame'
4242 if self._preferredquality is not None:
4243 more_opts += ['-ab', self._preferredquality]
4245 # We convert the audio (lossy)
4246 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4247 extension = self._preferredcodec
4249 if self._preferredquality is not None:
4250 more_opts += ['-ab', self._preferredquality]
4251 if self._preferredcodec == 'aac':
4252 more_opts += ['-f', 'adts']
4253 if self._preferredcodec == 'm4a':
4254 more_opts += ['-absf', 'aac_adtstoasc']
4255 if self._preferredcodec == 'vorbis':
4257 if self._preferredcodec == 'wav':
4259 more_opts += ['-f', 'wav']
4261 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4262 new_path = prefix + sep + extension
4263 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4265 self.run_ffmpeg(path, new_path, acodec, more_opts)
4267 etype,e,tb = sys.exc_info()
4268 if isinstance(e, AudioConversionError):
4269 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4271 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4274 # Try to update the date time for extracted audio file.
4275 if information.get('filetime') is not None:
4277 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4279 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4281 if not self._keepvideo:
4283 os.remove(_encodeFilename(path))
4284 except (IOError, OSError):
4285 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point subsequent post-processors at the newly created audio file.
4288 information['filepath'] = new_path
4292 def updateSelf(downloader, filename):
4293 ''' Update the program file with the latest version from the repository '''
4294 # Note: downloader only used for options
# NOTE(review): excerpt is missing the 'try:' openers and 'return'/close
# lines between the numbered statements.
4295 if not os.access(filename, os.W_OK):
4296 sys.exit('ERROR: no write permissions on %s' % filename)
4298 downloader.to_screen(u'Updating to latest version...')
# Download the latest script and compare its embedded __version__ string
# against ours to avoid rewriting the file when already current.
4302 urlh = urllib.urlopen(UPDATE_URL)
4303 newcontent = urlh.read()
4305 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4306 if vmatch is not None and vmatch.group(1) == __version__:
4307 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4311 except (IOError, OSError), err:
4312 sys.exit('ERROR: unable to download latest version')
# Overwrite our own script file in place with the downloaded content.
4315 outf = open(filename, 'wb')
4317 outf.write(newcontent)
4320 except (IOError, OSError), err:
4321 sys.exit('ERROR: unable to overwrite current version')
4323 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4326 def _readOptions(filename_bytes):
# Read extra command-line options from a config file: each line is
# shlex-split (with '#' comments honoured) and the tokens accumulated.
# A missing file yields an empty list.
# NOTE(review): the 'try:'/'except', the accumulator initialisation and the
# final 'return res' are missing from this excerpt.
4328 optionf = open(filename_bytes)
4330 return [] # silently skip if file is not present
4334 res += shlex.split(l, comments=True)
4339 def _format_option_string(option):
4340 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Render an optparse Option as '-s, --long METAVAR' for the help listing.
# NOTE(review): the 'opts = []' initialisation line is missing from this
# excerpt.
4344 if option._short_opts: opts.append(option._short_opts[0])
4345 if option._long_opts: opts.append(option._long_opts[0])
# Insert the ', ' separator only when both a short and long form exist.
4346 if len(opts) > 1: opts.insert(1, ', ')
4348 if option.takes_value(): opts.append(' %s' % option.metavar)
4350 return "".join(opts)
4352 def _find_term_columns():
# Best-effort terminal width: honour $COLUMNS, otherwise ask 'stty size'.
# NOTE(review): the $COLUMNS early-return, the 'try:' and the fallback
# 'return None' lines are missing from this excerpt.
4353 columns = os.environ.get('COLUMNS', None)
4358 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4359 out,err = sp.communicate()
# 'stty size' prints 'rows cols'; the second field is the width.
4360 return int(out.split()[1])
# NOTE(review): interior of parseOpts(); its 'def parseOpts():' header and
# several lines (the max_width default, the kw dict braces, some add_option
# calls and the filesystem option group registration) are missing from this
# excerpt.
4366 max_help_position = 80
4368 # No need to wrap help messages if we're on a wide console
4369 columns = _find_term_columns()
4370 if columns: max_width = columns
4372 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4373 fmt.format_option_strings = _format_option_string
4376 'version' : __version__,
4378 'usage' : '%prog [options] url [url...]',
4379 'conflict_handler' : 'resolve',
4382 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-output section.
4385 general = optparse.OptionGroup(parser, 'General Options')
4386 selection = optparse.OptionGroup(parser, 'Video Selection')
4387 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4388 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4389 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4390 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4391 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4393 general.add_option('-h', '--help',
4394 action='help', help='print this help text and exit')
4395 general.add_option('-v', '--version',
4396 action='version', help='print program version and exit')
4397 general.add_option('-U', '--update',
4398 action='store_true', dest='update_self', help='update this program to latest version')
4399 general.add_option('-i', '--ignore-errors',
4400 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4401 general.add_option('-r', '--rate-limit',
4402 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4403 general.add_option('-R', '--retries',
4404 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4405 general.add_option('--dump-user-agent',
4406 action='store_true', dest='dump_user_agent',
4407 help='display the current browser identification', default=False)
4408 general.add_option('--list-extractors',
4409 action='store_true', dest='list_extractors',
4410 help='List all supported extractors and the URLs they would handle', default=False)
4412 selection.add_option('--playlist-start',
4413 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4414 selection.add_option('--playlist-end',
4415 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4416 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4417 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4418 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4420 authentication.add_option('-u', '--username',
4421 dest='username', metavar='USERNAME', help='account username')
4422 authentication.add_option('-p', '--password',
4423 dest='password', metavar='PASSWORD', help='account password')
4424 authentication.add_option('-n', '--netrc',
4425 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4428 video_format.add_option('-f', '--format',
4429 action='store', dest='format', metavar='FORMAT', help='video format code')
4430 video_format.add_option('--all-formats',
4431 action='store_const', dest='format', help='download all available video formats', const='all')
4432 video_format.add_option('--prefer-free-formats',
4433 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4434 video_format.add_option('--max-quality',
4435 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4436 video_format.add_option('-F', '--list-formats',
4437 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4438 video_format.add_option('--write-srt',
4439 action='store_true', dest='writesubtitles',
4440 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4441 video_format.add_option('--srt-lang',
4442 action='store', dest='subtitleslang', metavar='LANG',
4443 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4446 verbosity.add_option('-q', '--quiet',
4447 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4448 verbosity.add_option('-s', '--simulate',
4449 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4450 verbosity.add_option('--skip-download',
4451 action='store_true', dest='skip_download', help='do not download the video', default=False)
4452 verbosity.add_option('-g', '--get-url',
4453 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4454 verbosity.add_option('-e', '--get-title',
4455 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4456 verbosity.add_option('--get-thumbnail',
4457 action='store_true', dest='getthumbnail',
4458 help='simulate, quiet but print thumbnail URL', default=False)
4459 verbosity.add_option('--get-description',
4460 action='store_true', dest='getdescription',
4461 help='simulate, quiet but print video description', default=False)
4462 verbosity.add_option('--get-filename',
4463 action='store_true', dest='getfilename',
4464 help='simulate, quiet but print output filename', default=False)
4465 verbosity.add_option('--get-format',
4466 action='store_true', dest='getformat',
4467 help='simulate, quiet but print output format', default=False)
4468 verbosity.add_option('--no-progress',
4469 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4470 verbosity.add_option('--console-title',
4471 action='store_true', dest='consoletitle',
4472 help='display progress in console titlebar', default=False)
# Note: '-v' here resolves against the earlier '-v/--version' via the
# parser's 'resolve' conflict handler.
4473 verbosity.add_option('-v', '--verbose',
4474 action='store_true', dest='verbose', help='print various debugging information', default=False)
4477 filesystem.add_option('-t', '--title',
4478 action='store_true', dest='usetitle', help='use title in file name', default=False)
4479 filesystem.add_option('-l', '--literal',
4480 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4481 filesystem.add_option('-A', '--auto-number',
4482 action='store_true', dest='autonumber',
4483 help='number downloaded files starting from 00000', default=False)
4484 filesystem.add_option('-o', '--output',
4485 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4486 filesystem.add_option('-a', '--batch-file',
4487 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4488 filesystem.add_option('-w', '--no-overwrites',
4489 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4490 filesystem.add_option('-c', '--continue',
4491 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4492 filesystem.add_option('--no-continue',
4493 action='store_false', dest='continue_dl',
4494 help='do not resume partially downloaded files (restart from beginning)')
4495 filesystem.add_option('--cookies',
4496 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4497 filesystem.add_option('--no-part',
4498 action='store_true', dest='nopart', help='do not use .part files', default=False)
4499 filesystem.add_option('--no-mtime',
4500 action='store_false', dest='updatetime',
4501 help='do not use the Last-modified header to set the file modification time', default=True)
4502 filesystem.add_option('--write-description',
4503 action='store_true', dest='writedescription',
4504 help='write video description to a .description file', default=False)
4505 filesystem.add_option('--write-info-json',
4506 action='store_true', dest='writeinfojson',
4507 help='write video metadata to a .info.json file', default=False)
4510 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4511 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4512 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4513 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4514 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4515 help='ffmpeg audio bitrate specification, 128k by default')
4516 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4517 help='keeps the video file on disk after the post-processing; the video is erased by default')
4520 parser.add_option_group(general)
4521 parser.add_option_group(selection)
4522 parser.add_option_group(filesystem)
4523 parser.add_option_group(verbosity)
4524 parser.add_option_group(video_format)
4525 parser.add_option_group(authentication)
4526 parser.add_option_group(postproc)
# Config-file options: /etc/youtube-dl.conf, then the per-user file under
# $XDG_CONFIG_HOME (or ~/.config), then the actual command line (which wins).
4528 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4530 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4532 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4533 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4534 opts, args = parser.parse_args(argv)
4536 return parser, opts, args
4538 def gen_extractors():
4539 """ Return a list of an instance of every supported extractor.
4540 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors are created once and handed to the dependent IEs
# (playlist/user/search variants reuse the same core extractor instance).
4542 youtube_ie = YoutubeIE()
4543 google_ie = GoogleIE()
4544 yahoo_ie = YahooIE()
# NOTE(review): the 'return [' line and many of the list entries (and the
# closing bracket) are missing from this excerpt.
4546 YoutubePlaylistIE(youtube_ie),
4547 YoutubeUserIE(youtube_ie),
4548 YoutubeSearchIE(youtube_ie),
4550 MetacafeIE(youtube_ie),
4553 GoogleSearchIE(google_ie),
4556 YahooSearchIE(yahoo_ie),
4569 StanfordOpenClassroomIE(),
# NOTE(review): interior of the main routine; its 'def' line and many
# guards ('try:' openers, 'sys.exit' calls, dict keys of the FileDownloader
# params) are missing from this excerpt -- consult the full file before
# editing control flow here.
4576 parser, opts, args = parseOpts()
4578 # Open appropriate CookieJar
4579 if opts.cookiefile is None:
4580 jar = cookielib.CookieJar()
4583 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4584 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4586 except (IOError, OSError), err:
4587 sys.exit(u'ERROR: unable to open cookie file')
4590 if opts.dump_user_agent:
4591 print std_headers['User-Agent']
4594 # Batch file verification
4596 if opts.batchfile is not None:
4598 if opts.batchfile == '-':
4601 batchfd = open(opts.batchfile, 'r')
4602 batchurls = batchfd.readlines()
4603 batchurls = [x.strip() for x in batchurls]
# Drop empty lines and comment lines starting with '#', '/' or ';'.
4604 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4606 sys.exit(u'ERROR: batch file could not be read')
4607 all_urls = batchurls + args
4609 # General configuration
4610 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4611 proxy_handler = urllib2.ProxyHandler()
4612 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4613 urllib2.install_opener(opener)
4614 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4617 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4619 extractors = gen_extractors()
4621 if opts.list_extractors:
4622 for ie in extractors:
4624 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4625 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4626 for mu in matchedUrls:
4630 # Conflicting, missing and erroneous options
4631 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4632 parser.error(u'using .netrc conflicts with giving username/password')
4633 if opts.password is not None and opts.username is None:
4634 parser.error(u'account username missing')
4635 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4636 parser.error(u'using output template conflicts with using title, literal title or auto number')
4637 if opts.usetitle and opts.useliteral:
4638 parser.error(u'using title conflicts with using literal title')
4639 if opts.username is not None and opts.password is None:
4640 opts.password = getpass.getpass(u'Type account password and press return:')
4641 if opts.ratelimit is not None:
4642 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4643 if numeric_limit is None:
4644 parser.error(u'invalid rate limit specified')
4645 opts.ratelimit = numeric_limit
4646 if opts.retries is not None:
4648 opts.retries = long(opts.retries)
4649 except (TypeError, ValueError), err:
4650 parser.error(u'invalid retry count specified')
4652 opts.playliststart = int(opts.playliststart)
4653 if opts.playliststart <= 0:
4654 raise ValueError(u'Playlist start must be positive')
4655 except (TypeError, ValueError), err:
4656 parser.error(u'invalid playlist start number specified')
4658 opts.playlistend = int(opts.playlistend)
4659 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4660 raise ValueError(u'Playlist end must be greater than playlist start')
4661 except (TypeError, ValueError), err:
4662 parser.error(u'invalid playlist end number specified')
4663 if opts.extractaudio:
4664 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4665 parser.error(u'invalid audio format specified')
# Build the FileDownloader; any --get-* flag implies quiet + skip_download.
4668 fd = FileDownloader({
4669 'usenetrc': opts.usenetrc,
4670 'username': opts.username,
4671 'password': opts.password,
4672 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4673 'forceurl': opts.geturl,
4674 'forcetitle': opts.gettitle,
4675 'forcethumbnail': opts.getthumbnail,
4676 'forcedescription': opts.getdescription,
4677 'forcefilename': opts.getfilename,
4678 'forceformat': opts.getformat,
4679 'simulate': opts.simulate,
4680 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4681 'format': opts.format,
4682 'format_limit': opts.format_limit,
4683 'listformats': opts.listformats,
# Output template: explicit -o wins, then format/title/autonumber driven
# defaults, and finally plain '%(id)s.%(ext)s'.
4684 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4685 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4686 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4687 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4688 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4689 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4690 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4691 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4692 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4693 or u'%(id)s.%(ext)s'),
4694 'ignoreerrors': opts.ignoreerrors,
4695 'ratelimit': opts.ratelimit,
4696 'nooverwrites': opts.nooverwrites,
4697 'retries': opts.retries,
4698 'continuedl': opts.continue_dl,
4699 'noprogress': opts.noprogress,
4700 'playliststart': opts.playliststart,
4701 'playlistend': opts.playlistend,
4702 'logtostderr': opts.outtmpl == '-',
4703 'consoletitle': opts.consoletitle,
4704 'nopart': opts.nopart,
4705 'updatetime': opts.updatetime,
4706 'writedescription': opts.writedescription,
4707 'writeinfojson': opts.writeinfojson,
4708 'writesubtitles': opts.writesubtitles,
4709 'subtitleslang': opts.subtitleslang,
4710 'matchtitle': opts.matchtitle,
4711 'rejecttitle': opts.rejecttitle,
4712 'max_downloads': opts.max_downloads,
4713 'prefer_free_formats': opts.prefer_free_formats,
4714 'verbose': opts.verbose,
4716 for extractor in extractors:
4717 fd.add_info_extractor(extractor)
4720 if opts.extractaudio:
4721 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4724 if opts.update_self:
4725 updateSelf(fd, sys.argv[0])
4728 if len(all_urls) < 1:
4729 if not opts.update_self:
4730 parser.error(u'you must provide at least one URL')
4735 retcode = fd.download(all_urls)
4736 except MaxDownloadsReached:
4737 fd.to_screen(u'--max-download limit reached, aborting.')
4740 # Dump cookie jar if requested
4741 if opts.cookiefile is not None:
4744 except (IOError, OSError), err:
4745 sys.exit(u'ERROR: unable to save cookie jar')
4752 except DownloadError:
4754 except SameFileError:
4755 sys.exit(u'ERROR: fixed output name but more than one file to download')
4756 except KeyboardInterrupt:
4757 sys.exit(u'\nERROR: Interrupted by user')
4759 if __name__ == '__main__':
# Script entry point; the guarded body (the call into the main routine) is
# not visible in this excerpt.
4762 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: