2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
def raiseError(msg, i):
    # Abort parsing with full context: the position, the whole input's
    # repr, and the repr of the unconsumed tail (`s` comes from the
    # enclosing trivialjson loads() closure).
    raise ValueError('%s at position %d of %r: %r' % (msg, i, s, s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
197 class IDParser(HTMLParser.HTMLParser):
198 """Modified HTMLParser that isolates a tag with the specified id"""
199 def __init__(self, id):
205 self.watch_startpos = False
206 HTMLParser.HTMLParser.__init__(self)
208 def loads(self, html):
213 def handle_starttag(self, tag, attrs):
216 self.find_startpos(None)
217 if 'id' in attrs and attrs['id'] == self.id:
220 self.watch_startpos = True
222 if not tag in self.depth: self.depth[tag] = 0
225 def handle_endtag(self, tag):
227 if tag in self.depth: self.depth[tag] -= 1
228 if self.depth[self.result[0]] == 0:
230 self.result.append(self.getpos())
def find_startpos(self, x):
    """Record the parser position just after the opening tag that carries
    the requested id, storing it as self.result[1]."""
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())
# Any token following the opening tag (text, entity, comment, ...) marks
# where the element's content begins, so every handler aliases to
# find_startpos.
handle_entityref = find_startpos
handle_charref = find_startpos
handle_data = find_startpos
handle_comment = find_startpos
handle_decl = find_startpos
handle_pi = find_startpos
unknown_decl = find_startpos
241 def get_result(self):
242 if self.result == None: return None
243 if len(self.result) != 3: return None
244 lines = self.html.split('\n')
245 lines = lines[self.result[1][0]-1:self.result[2][0]]
246 lines[0] = lines[0][self.result[1][1]:]
248 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
249 lines[-1] = lines[-1][:self.result[2][1]]
250 return '\n'.join(lines).strip()
252 def get_element_by_id(id, html):
253 """Return the content of the tag with the specified id in the passed HTML document"""
254 parser = IDParser(id)
256 return parser.get_result()
259 def preferredencoding():
260 """Get preferred encoding.
262 Returns the best encoding scheme for the system, based on
263 locale.getpreferredencoding() and some further tweaks.
265 def yield_preferredencoding():
267 pref = locale.getpreferredencoding()
273 return yield_preferredencoding().next()
276 def htmlentity_transform(matchobj):
277 """Transforms an HTML entity to a Unicode character.
279 This function receives a match object and is intended to be used with
280 the re.sub() function.
282 entity = matchobj.group(1)
284 # Known non-numeric HTML entity
285 if entity in htmlentitydefs.name2codepoint:
286 return unichr(htmlentitydefs.name2codepoint[entity])
289 mobj = re.match(ur'(?u)#(x?\d+)', entity)
291 numstr = mobj.group(1)
292 if numstr.startswith(u'x'):
294 numstr = u'0%s' % numstr
297 return unichr(long(numstr, base))
299 # Unknown entity in name, return its literal representation
300 return (u'&%s;' % entity)
303 def clean_html(html):
304 """Clean an HTML snippet into a readable string"""
306 html = html.replace('\n', ' ')
307 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
309 html = re.sub('<.*?>', '', html)
310 # Replace html entities
311 html = _unescapeHTML(html)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities first, then neutralize path separators so the
    # title cannot escape into other directories.
    cleaned = _unescapeHTML(utitle)
    return cleaned.replace(unicode(os.sep), u'%')
321 def sanitize_open(filename, open_mode):
322 """Try to open the given filename, and slightly tweak it if this fails.
324 Attempts to open the given filename. If this fails, it tries to change
325 the filename slightly, step by step, until it's either able to open it
326 or it fails and raises a final exception, like the standard open()
329 It returns the tuple (stream, definitive_file_name).
333 if sys.platform == 'win32':
335 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
336 return (sys.stdout, filename)
337 stream = open(_encodeFilename(filename), open_mode)
338 return (stream, filename)
339 except (IOError, OSError), err:
340 # In case of error, try to remove win32 forbidden chars
341 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
343 # An exception here should be caught in the caller
344 stream = open(_encodeFilename(filename), open_mode)
345 return (stream, filename)
348 def timeconvert(timestr):
349 """Convert RFC 2822 defined time string into system timestamp"""
351 timetuple = email.utils.parsedate_tz(timestr)
352 if timetuple is not None:
353 timestamp = email.utils.mktime_tz(timetuple)
356 def _simplify_title(title):
357 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
358 return expr.sub(u'_', title).strip(u'_')
360 def _orderedSet(iterable):
361 """ Remove all duplicates from the input iterable """
368 def _unescapeHTML(s):
370 @param s a string (of type unicode)
372 assert type(s) == type(u'')
374 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
377 def _encodeFilename(s):
379 @param s The name of the file (of type unicode)
382 assert type(s) == type(u'')
384 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
385 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
386 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
387 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
390 return s.encode(sys.getfilesystemencoding(), 'ignore')
392 class DownloadError(Exception):
393 """Download Error exception.
395 This exception may be thrown by FileDownloader objects if they are not
396 configured to continue on errors. They will contain the appropriate
402 class SameFileError(Exception):
403 """Same File exception.
405 This exception will be thrown by FileDownloader objects if they detect
406 multiple files would have to be downloaded to the same file on disk.
411 class PostProcessingError(Exception):
412 """Post Processing exception.
414 This exception may be raised by PostProcessor's .run() method to
415 indicate an error in the postprocessing task.
419 class MaxDownloadsReached(Exception):
420 """ --max-downloads limit has been reached. """
424 class UnavailableVideoError(Exception):
425 """Unavailable Format exception.
427 This exception will be thrown when a video is requested
428 in a format that is not available for that video.
433 class ContentTooShortError(Exception):
434 """Content Too Short exception.
436 This exception may be raised by FileDownloader objects when a file they
437 download is too small for what the server announced first, indicating
438 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # Keep both byte counts so the caller can report how short the
    # download fell of the announced Content-Length.
    self.expected = expected
    self.downloaded = downloaded
449 class YoutubeDLHandler(urllib2.HTTPHandler):
450 """Handler for HTTP requests and responses.
452 This class, when installed with an OpenerDirector, automatically adds
453 the standard headers to every HTTP request and handles gzipped and
454 deflated responses from web servers. If compression is to be avoided in
455 a particular request, the original request in the program code only has
456 to include the HTTP header "Youtubedl-No-Compression", which will be
457 removed before making the real request.
459 Part of this code was copied from:
461 http://techknack.net/python-urllib2-handlers/
463 Andrew Rowls, the author of that code, agreed to release it to the
470 return zlib.decompress(data, -zlib.MAX_WBITS)
472 return zlib.decompress(data)
475 def addinfourl_wrapper(stream, headers, url, code):
476 if hasattr(urllib2.addinfourl, 'getcode'):
477 return urllib2.addinfourl(stream, headers, url, code)
478 ret = urllib2.addinfourl(stream, headers, url)
482 def http_request(self, req):
483 for h in std_headers:
486 req.add_header(h, std_headers[h])
487 if 'Youtubedl-no-compression' in req.headers:
488 if 'Accept-encoding' in req.headers:
489 del req.headers['Accept-encoding']
490 del req.headers['Youtubedl-no-compression']
493 def http_response(self, req, resp):
496 if resp.headers.get('Content-encoding', '') == 'gzip':
497 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
498 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
499 resp.msg = old_resp.msg
501 if resp.headers.get('Content-encoding', '') == 'deflate':
502 gz = StringIO.StringIO(self.deflate(resp.read()))
503 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
504 resp.msg = old_resp.msg
508 class FileDownloader(object):
509 """File Downloader class.
511 File downloader objects are the ones responsible of downloading the
512 actual video file and writing it to disk if the user has requested
513 it, among some other tasks. In most cases there should be one per
514 program. As, given a video URL, the downloader doesn't know how to
515 extract all the needed information, task that InfoExtractors do, it
516 has to pass the URL to one of them.
518 For this, file downloader objects have a method that allows
519 InfoExtractors to be registered in a given order. When it is passed
520 a URL, the file downloader handles it to the first InfoExtractor it
521 finds that reports being able to handle it. The InfoExtractor extracts
522 all the information about the video or videos the URL refers to, and
523 asks the FileDownloader to process the video information, possibly
524 downloading the video.
526 File downloaders accept a lot of parameters. In order not to saturate
527 the object constructor with arguments, it receives a dictionary of
528 options instead. These options are available through the params
529 attribute for the InfoExtractors to use. The FileDownloader also
530 registers itself as the downloader in charge for the InfoExtractors
531 that are added to it, so this is a "mutual registration".
535 username: Username for authentication purposes.
536 password: Password for authentication purposes.
537 usenetrc: Use netrc for authentication instead.
538 quiet: Do not print messages to stdout.
539 forceurl: Force printing final URL.
540 forcetitle: Force printing title.
541 forcethumbnail: Force printing thumbnail URL.
542 forcedescription: Force printing description.
543 forcefilename: Force printing final filename.
544 simulate: Do not download the video files.
545 format: Video format code.
546 format_limit: Highest quality format to try.
547 outtmpl: Template for output names.
548 ignoreerrors: Do not stop on download errors.
549 ratelimit: Download speed limit, in bytes/sec.
550 nooverwrites: Prevent overwriting files.
551 retries: Number of times to retry for HTTP error 5xx
552 continuedl: Try to continue downloads if possible.
553 noprogress: Do not print the progress bar.
554 playliststart: Playlist item to start at.
555 playlistend: Playlist item to end at.
556 matchtitle: Download only matching titles.
557 rejecttitle: Reject downloads for matching titles.
558 logtostderr: Log messages to stderr instead of stdout.
559 consoletitle: Display progress in console window's titlebar.
560 nopart: Do not use temporary .part files.
561 updatetime: Use the Last-modified header to set output file timestamps.
562 writedescription: Write the video description to a .description file
563 writeinfojson: Write the video description to a .info.json file
564 writesubtitles: Write the video subtitles to a .srt file
565 subtitleslang: Language of the subtitles to download
571 _download_retcode = None
572 _num_downloads = None
575 def __init__(self, params):
576 """Create a FileDownloader object with the given options."""
579 self._download_retcode = 0
580 self._num_downloads = 0
581 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
585 def format_bytes(bytes):
588 if type(bytes) is str:
593 exponent = long(math.log(bytes, 1024.0))
594 suffix = 'bkMGTPEZY'[exponent]
595 converted = float(bytes) / float(1024 ** exponent)
596 return '%.2f%s' % (converted, suffix)
599 def calc_percent(byte_counter, data_len):
602 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
605 def calc_eta(start, now, total, current):
609 if current == 0 or dif < 0.001: # One millisecond
611 rate = float(current) / dif
612 eta = long((float(total) - float(current)) / rate)
613 (eta_mins, eta_secs) = divmod(eta, 60)
616 return '%02d:%02d' % (eta_mins, eta_secs)
619 def calc_speed(start, now, bytes):
621 if bytes == 0 or dif < 0.001: # One millisecond
622 return '%10s' % '---b/s'
623 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
626 def best_block_size(elapsed_time, bytes):
627 new_min = max(bytes / 2.0, 1.0)
628 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
629 if elapsed_time < 0.001:
631 rate = bytes / elapsed_time
639 def parse_bytes(bytestr):
640 """Parse a string indicating a byte quantity into a long integer."""
641 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
644 number = float(matchobj.group(1))
645 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
646 return long(round(number * multiplier))
648 def add_info_extractor(self, ie):
649 """Add an InfoExtractor object to the end of the list."""
651 ie.set_downloader(self)
653 def add_post_processor(self, pp):
654 """Add a PostProcessor object to the end of the chain."""
656 pp.set_downloader(self)
def to_screen(self, message, skip_eol=False):
    """Write message to the configured screen stream unless quiet mode is on."""
    assert type(message) == type(u'')
    if self.params.get('quiet', False):
        return
    terminator = u'' if skip_eol else u'\n'
    output = message + terminator
    # Python 2 lies about the mode of sys.stdout/sys.stderr, so encode
    # manually unless the stream is a genuine byte stream on Python 3.
    if 'b' not in self._screen_file.mode or sys.version_info[0] < 3:
        output = output.encode(preferredencoding(), 'ignore')
    self._screen_file.write(output)
    self._screen_file.flush()
def to_stderr(self, message):
    """Print message to stderr."""
    # Equivalent to `print >>sys.stderr, ...`: encoded bytes plus newline.
    sys.stderr.write(message.encode(preferredencoding()) + '\n')
674 def to_cons_title(self, message):
675 """Set console/terminal window title to message."""
676 if not self.params.get('consoletitle', False):
678 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
679 # c_wchar_p() might not be necessary if `message` is
680 # already of type unicode()
681 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
682 elif 'TERM' in os.environ:
683 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Return True when the output template contains no %(field)s
    placeholders, i.e. every download would map to the same filename."""
    # Escaped non-raw literal is byte-identical to the original
    # ur'(?u)%\(.+?\)s' pattern.
    placeholder = re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl'])
    return placeholder is None
def trouble(self, message=None):
    """Handle a download problem.

    Prints the message (if any) to stderr, then either raises a
    DownloadError or — when 'ignoreerrors' is set — records a non-zero
    return code and carries on.
    """
    if message is not None:
        self.to_stderr(message)
    if self.params.get('ignoreerrors', False):
        self._download_retcode = 1
    else:
        raise DownloadError(message)
702 def slow_down(self, start_time, byte_counter):
703 """Sleep if the download speed is over the rate limit."""
704 rate_limit = self.params.get('ratelimit', None)
705 if rate_limit is None or byte_counter == 0:
708 elapsed = now - start_time
711 speed = float(byte_counter) / elapsed
712 if speed > rate_limit:
713 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
715 def temp_name(self, filename):
716 """Returns a temporary filename for the given filename."""
717 if self.params.get('nopart', False) or filename == u'-' or \
718 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
720 return filename + u'.part'
722 def undo_temp_name(self, filename):
723 if filename.endswith(u'.part'):
724 return filename[:-len(u'.part')]
727 def try_rename(self, old_filename, new_filename):
729 if old_filename == new_filename:
731 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
732 except (IOError, OSError), err:
733 self.trouble(u'ERROR: unable to rename file')
735 def try_utime(self, filename, last_modified_hdr):
736 """Try to set the last-modified time of the given file."""
737 if last_modified_hdr is None:
739 if not os.path.isfile(_encodeFilename(filename)):
741 timestr = last_modified_hdr
744 filetime = timeconvert(timestr)
748 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description file is being written."""
    notice = u'[info] Writing video description to: ' + descfn
    self.to_screen(notice)
def report_writesubtitles(self, srtfn):
    """Announce that the subtitles file is being written."""
    notice = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(notice)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file has been written."""
    notice = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(notice)
def report_destination(self, filename):
    """Announce the destination filename of the current download."""
    notice = u'[download] Destination: ' + filename
    self.to_screen(notice)
769 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
770 """Report download progress."""
771 if self.params.get('noprogress', False):
773 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
774 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
775 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
776 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at the given byte."""
    notice = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(notice)
def report_retry(self, count, retries):
    """Announce a retry after an HTTP 5xx server error."""
    notice = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(notice)
786 def report_file_already_downloaded(self, file_name):
787 """Report file has already been fully downloaded."""
789 self.to_screen(u'[download] %s has already been downloaded' % file_name)
790 except (UnicodeEncodeError), err:
791 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report that resuming the download was impossible."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
797 def report_finish(self):
798 """Report download finished."""
799 if self.params.get('noprogress', False):
800 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the ordinal that assigns a number to each downloaded file."""
    self._num_downloads = self._num_downloads + 1
808 def prepare_filename(self, info_dict):
809 """Generate the output filename."""
811 template_dict = dict(info_dict)
812 template_dict['epoch'] = unicode(long(time.time()))
813 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
814 filename = self.params['outtmpl'] % template_dict
816 except (ValueError, KeyError), err:
817 self.trouble(u'ERROR: invalid system charset or erroneous output template')
820 def _match_entry(self, info_dict):
821 """ Returns None iff the file should be downloaded """
823 title = info_dict['title']
824 matchtitle = self.params.get('matchtitle', False)
825 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
826 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
827 rejecttitle = self.params.get('rejecttitle', False)
828 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
829 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
832 def process_info(self, info_dict):
833 """Process a single dictionary returned by an InfoExtractor."""
835 reason = self._match_entry(info_dict)
836 if reason is not None:
837 self.to_screen(u'[download] ' + reason)
840 max_downloads = self.params.get('max_downloads')
841 if max_downloads is not None:
842 if self._num_downloads > int(max_downloads):
843 raise MaxDownloadsReached()
845 filename = self.prepare_filename(info_dict)
848 if self.params.get('forcetitle', False):
849 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
850 if self.params.get('forceurl', False):
851 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
852 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
853 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
854 if self.params.get('forcedescription', False) and 'description' in info_dict:
855 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
856 if self.params.get('forcefilename', False) and filename is not None:
857 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
858 if self.params.get('forceformat', False):
859 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
861 # Do nothing else if in simulate mode
862 if self.params.get('simulate', False):
869 dn = os.path.dirname(_encodeFilename(filename))
870 if dn != '' and not os.path.exists(dn): # dn is already encoded
872 except (OSError, IOError), err:
873 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
876 if self.params.get('writedescription', False):
878 descfn = filename + u'.description'
879 self.report_writedescription(descfn)
880 descfile = open(_encodeFilename(descfn), 'wb')
882 descfile.write(info_dict['description'].encode('utf-8'))
885 except (OSError, IOError):
886 self.trouble(u'ERROR: Cannot write description file ' + descfn)
889 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
890 # subtitles download errors are already managed as troubles in relevant IE
891 # that way it will silently go on when used with unsupporting IE
893 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
894 self.report_writesubtitles(srtfn)
895 srtfile = open(_encodeFilename(srtfn), 'wb')
897 srtfile.write(info_dict['subtitles'].encode('utf-8'))
900 except (OSError, IOError):
901 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
904 if self.params.get('writeinfojson', False):
905 infofn = filename + u'.info.json'
906 self.report_writeinfojson(infofn)
909 except (NameError,AttributeError):
910 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
913 infof = open(_encodeFilename(infofn), 'wb')
915 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
916 json.dump(json_info_dict, infof)
919 except (OSError, IOError):
920 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
923 if not self.params.get('skip_download', False):
924 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
928 success = self._do_download(filename, info_dict)
929 except (OSError, IOError), err:
930 raise UnavailableVideoError
931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
934 except (ContentTooShortError, ), err:
935 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
940 self.post_process(filename, info_dict)
941 except (PostProcessingError), err:
942 self.trouble(u'ERROR: postprocessing: %s' % str(err))
945 def download(self, url_list):
946 """Download a given list of URLs."""
947 if len(url_list) > 1 and self.fixed_template():
948 raise SameFileError(self.params['outtmpl'])
951 suitable_found = False
953 # Go to next InfoExtractor if not suitable
954 if not ie.suitable(url):
957 # Suitable InfoExtractor found
958 suitable_found = True
960 # Extract information from URL and process it
963 # Suitable InfoExtractor had been found; go to next URL
966 if not suitable_found:
967 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
969 return self._download_retcode
971 def post_process(self, filename, ie_info):
972 """Run the postprocessing chain on the given file."""
974 info['filepath'] = filename
980 def _download_with_rtmpdump(self, filename, url, player_url):
981 self.report_destination(filename)
982 tmpfilename = self.temp_name(filename)
984 # Check for rtmpdump first
986 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
987 except (OSError, IOError):
988 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
991 # Download using rtmpdump. rtmpdump returns exit code 2 when
992 # the connection was interrumpted and resuming appears to be
993 # possible. This is part of rtmpdump's normal usage, AFAIK.
994 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
995 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
996 if self.params.get('verbose', False):
999 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
1002 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
1003 retval = subprocess.call(args)
1004 while retval == 2 or retval == 1:
1005 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
1006 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
1007 time.sleep(5.0) # This seems to be needed
1008 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
1009 cursize = os.path.getsize(_encodeFilename(tmpfilename))
1010 if prevsize == cursize and retval == 1:
1012 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
1013 if prevsize == cursize and retval == 2 and cursize > 1024:
1014 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
1018 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
1019 self.try_rename(tmpfilename, filename)
1022 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
1025 def _do_download(self, filename, info_dict):
1026 url = info_dict['url']
1027 player_url = info_dict.get('player_url', None)
1029 # Check file already present
1030 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
1031 self.report_file_already_downloaded(filename)
1034 # Attempt to download using rtmpdump
1035 if url.startswith('rtmp'):
1036 return self._download_with_rtmpdump(filename, url, player_url)
1038 tmpfilename = self.temp_name(filename)
1041 # Do not include the Accept-Encoding header
1042 headers = {'Youtubedl-no-compression': 'True'}
1043 basic_request = urllib2.Request(url, None, headers)
1044 request = urllib2.Request(url, None, headers)
1046 # Establish possible resume length
1047 if os.path.isfile(_encodeFilename(tmpfilename)):
1048 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
1054 if self.params.get('continuedl', False):
1055 self.report_resuming_byte(resume_len)
1056 request.add_header('Range','bytes=%d-' % resume_len)
1062 retries = self.params.get('retries', 0)
1063 while count <= retries:
1064 # Establish connection
1066 if count == 0 and 'urlhandle' in info_dict:
1067 data = info_dict['urlhandle']
1068 data = urllib2.urlopen(request)
1070 except (urllib2.HTTPError, ), err:
1071 if (err.code < 500 or err.code >= 600) and err.code != 416:
1072 # Unexpected HTTP error
1074 elif err.code == 416:
1075 # Unable to resume (requested range not satisfiable)
1077 # Open the connection again without the range header
1078 data = urllib2.urlopen(basic_request)
1079 content_length = data.info()['Content-Length']
1080 except (urllib2.HTTPError, ), err:
1081 if err.code < 500 or err.code >= 600:
1084 # Examine the reported length
1085 if (content_length is not None and
1086 (resume_len - 100 < long(content_length) < resume_len + 100)):
1087 # The file had already been fully downloaded.
1088 # Explanation to the above condition: in issue #175 it was revealed that
1089 # YouTube sometimes adds or removes a few bytes from the end of the file,
1090 # changing the file size slightly and causing problems for some users. So
1091 # I decided to implement a suggested change and consider the file
1092 # completely downloaded if the file size differs less than 100 bytes from
1093 # the one in the hard drive.
1094 self.report_file_already_downloaded(filename)
1095 self.try_rename(tmpfilename, filename)
1098 # The length does not match, we start the download over
1099 self.report_unable_to_resume()
1104 if count <= retries:
1105 self.report_retry(count, retries)
1108 self.trouble(u'ERROR: giving up after %s retries' % retries)
1111 data_len = data.info().get('Content-length', None)
1112 if data_len is not None:
1113 data_len = long(data_len) + resume_len
1114 data_len_str = self.format_bytes(data_len)
1115 byte_counter = 0 + resume_len
1119 # Download and write
1120 before = time.time()
1121 data_block = data.read(block_size)
1123 if len(data_block) == 0:
1125 byte_counter += len(data_block)
1127 # Open file just in time
1130 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1131 assert stream is not None
1132 filename = self.undo_temp_name(tmpfilename)
1133 self.report_destination(filename)
1134 except (OSError, IOError), err:
1135 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1138 stream.write(data_block)
1139 except (IOError, OSError), err:
1140 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1142 block_size = self.best_block_size(after - before, len(data_block))
1145 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1146 if data_len is None:
1147 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1149 percent_str = self.calc_percent(byte_counter, data_len)
1150 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1151 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1154 self.slow_down(start, byte_counter - resume_len)
1157 self.trouble(u'\nERROR: Did not get any data blocks')
1160 self.report_finish()
1161 if data_len is not None and byte_counter != data_len:
1162 raise ContentTooShortError(byte_counter, long(data_len))
1163 self.try_rename(tmpfilename, filename)
1165 # Update file modification time
1166 if self.params.get('updatetime', True):
1167 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
# NOTE(review): excerpt listing — jumps in the embedded original line numbers
# (e.g. 1173 -> 1175) mark source lines elided from this view.
1172 class InfoExtractor(object):
1173 """Information Extractor class.
1175 Information extractors are the classes that, given a URL, extract
1176 information from the video (or videos) the URL refers to. This
1177 information includes the real video URL, the video title and simplified
1178 title, author and others. The information is stored in a dictionary
1179 which is then passed to the FileDownloader. The FileDownloader
1180 processes this information possibly downloading the video to the file
1181 system, among other possible outcomes. The dictionaries must include
1182 the following fields:
1184 id: Video identifier.
1185 url: Final video URL.
1186 uploader: Nickname of the video uploader.
1187 title: Literal title.
1188 stitle: Simplified title.
1189 ext: Video filename extension.
1190 format: Video format.
1191 player_url: SWF Player URL (may be None).
1193 The following fields are optional. Their primary purpose is to allow
1194 youtube-dl to serve as the backend for a video search function, such
1195 as the one in youtube2mp3. They are only used when their respective
1196 forced printing functions are called:
1198 thumbnail: Full URL to a video thumbnail image.
1199 description: One-line video description.
1201 Subclasses of this one should re-define the _real_initialize() and
1202 _real_extract() methods and define a _VALID_URL regexp.
1203 Probably, they should also be added to the list of extractors.
1209 def __init__(self, downloader=None):
1210 """Constructor. Receives an optional downloader."""
1212 self.set_downloader(downloader)
1214 def suitable(self, url):
1215 """Receives a URL and returns True if suitable for this IE."""
# A URL is "suitable" when it matches the subclass's _VALID_URL regexp.
1216 return re.match(self._VALID_URL, url) is not None
1218 def initialize(self):
1219 """Initializes an instance (authentication, etc)."""
# Delegates to the subclass hook; guard logic (if any) is elided here.
1221 self._real_initialize()
1224 def extract(self, url):
1225 """Extracts URL information and returns it in list of dicts."""
1227 return self._real_extract(url)
1229 def set_downloader(self, downloader):
1230 """Sets the downloader for this IE."""
1231 self._downloader = downloader
1233 def _real_initialize(self):
1234 """Real initialization process. Redefine in subclasses."""
1237 def _real_extract(self, url):
1238 """Real extraction process. Redefine in subclasses."""
# NOTE(review): excerpt listing — jumps in the embedded original line numbers
# mark source lines elided from this view (e.g. returns after trouble() calls).
1242 class YoutubeIE(InfoExtractor):
1243 """Information extractor for youtube.com."""
# Matches watch/embed/e/v URLs, youtu.be short links and bare video ids,
# while excluding playlist/artist pages; the video id is captured in group 2.
1245 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1246 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1247 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1248 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1249 _NETRC_MACHINE = 'youtube'
1250 # Listed in order of quality
1251 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1252 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1253 _video_extensions = {
1259 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1264 _video_dimensions = {
1279 IE_NAME = u'youtube'
1281 def report_lang(self):
1282 """Report attempt to set language."""
1283 self._downloader.to_screen(u'[youtube] Setting language')
1285 def report_login(self):
1286 """Report attempt to log in."""
1287 self._downloader.to_screen(u'[youtube] Logging in')
1289 def report_age_confirmation(self):
1290 """Report attempt to confirm age."""
1291 self._downloader.to_screen(u'[youtube] Confirming age')
1293 def report_video_webpage_download(self, video_id):
1294 """Report attempt to download video webpage."""
1295 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1297 def report_video_info_webpage_download(self, video_id):
1298 """Report attempt to download video info webpage."""
1299 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1301 def report_video_subtitles_download(self, video_id):
1302 """Report attempt to download video subtitles."""
1303 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1305 def report_information_extraction(self, video_id):
1306 """Report attempt to extract video information."""
1307 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1309 def report_unavailable_format(self, video_id, format):
1310 """Report that the requested format is not available."""
1311 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1313 def report_rtmp_download(self):
1314 """Indicate the download will use the RTMP protocol."""
1315 self._downloader.to_screen(u'[youtube] RTMP download detected')
1317 def _closed_captions_xml_to_srt(self, xml_string):
# Convert YouTube timedtext XML into SRT text. A missing dur attribute
# defaults to 4 seconds. (The srt accumulator's initialization is elided
# from this excerpt.)
1319 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1320 # TODO parse xml instead of regex
1321 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1322 if not dur: dur = '4'
1323 start = float(start)
1324 end = start + float(dur)
# Format as SRT timestamps HH:MM:SS,mmm.
1325 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1326 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1327 caption = _unescapeHTML(caption)
1328 caption = _unescapeHTML(caption) # double cycle, intentional
1329 srt += str(n) + '\n'
1330 srt += start + ' --> ' + end + '\n'
1331 srt += caption + '\n\n'
1334 def _print_formats(self, formats):
# Print each itag with its extension and dimensions, for --list-formats.
1335 print 'Available formats:'
1337 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1339 def _real_initialize(self):
# Three best-effort steps before extraction: force English pages,
# optionally log in (explicit credentials or .netrc), confirm age.
1340 if self._downloader is None:
1345 downloader_params = self._downloader.params
1347 # Attempt to use provided username and password or .netrc data
1348 if downloader_params.get('username', None) is not None:
1349 username = downloader_params['username']
1350 password = downloader_params['password']
1351 elif downloader_params.get('usenetrc', False):
1353 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1354 if info is not None:
1358 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1359 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are warnings only; extraction continues unauthenticated.
1360 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1364 request = urllib2.Request(self._LANG_URL)
1367 urllib2.urlopen(request).read()
1368 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1372 # No authentication to be performed
1373 if username is None:
1378 'current_form': 'loginForm',
1380 'action_login': 'Log In',
1381 'username': username,
1382 'password': password,
1384 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1387 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1388 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1389 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1398 'action_confirm': 'Confirm',
1400 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1402 self.report_age_confirmation()
1403 age_results = urllib2.urlopen(request).read()
1404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Unlike the steps above, a failed age confirmation is a hard error.
1405 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1408 def _real_extract(self, url):
1409 # Extract video id from URL
1410 mobj = re.match(self._VALID_URL, url)
1412 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1414 video_id = mobj.group(2)
1417 self.report_video_webpage_download(video_id)
1418 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1420 video_webpage = urllib2.urlopen(request).read()
1421 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1422 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1425 # Attempt to extract SWF player URL
1426 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1427 if mobj is not None:
# The URL appears JS-escaped in the page; strip the backslashes.
1428 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1433 self.report_video_info_webpage_download(video_id)
# Try several 'el' values until one of them yields a token.
1434 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1435 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1436 % (video_id, el_type))
1437 request = urllib2.Request(video_info_url)
1439 video_info_webpage = urllib2.urlopen(request).read()
1440 video_info = parse_qs(video_info_webpage)
1441 if 'token' in video_info:
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1446 if 'token' not in video_info:
1447 if 'reason' in video_info:
1448 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1450 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1453 # Start extracting information
1454 self.report_information_extraction(video_id)
1457 if 'author' not in video_info:
1458 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1460 video_uploader = urllib.unquote_plus(video_info['author'][0])
1463 if 'title' not in video_info:
1464 self._downloader.trouble(u'ERROR: unable to extract video title')
1466 video_title = urllib.unquote_plus(video_info['title'][0])
1467 video_title = video_title.decode('utf-8')
1468 video_title = sanitize_title(video_title)
1471 simple_title = _simplify_title(video_title)
1474 if 'thumbnail_url' not in video_info:
1475 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1476 video_thumbnail = ''
1477 else: # don't panic if we can't find it
1478 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD,
# trying several human-readable date formats.
1482 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1483 if mobj is not None:
1484 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1485 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1486 for expression in format_expressions:
1488 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1493 video_description = get_element_by_id("eow-description", video_webpage)
1494 if video_description: video_description = clean_html(video_description.decode('utf8'))
1495 else: video_description = ''
# Closed captions: list available languages, pick the requested one
# (falling back to 'en', then the first listed), then fetch and convert.
1498 video_subtitles = None
1499 if self._downloader.params.get('writesubtitles', False):
1500 self.report_video_subtitles_download(video_id)
1501 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1503 srt_list = urllib2.urlopen(request).read()
1504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1507 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1509 if self._downloader.params.get('subtitleslang', False):
1510 srt_lang = self._downloader.params.get('subtitleslang')
1511 elif 'en' in srt_lang_list:
1514 srt_lang = srt_lang_list[0]
1515 if not srt_lang in srt_lang_list:
1516 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1518 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1520 srt_xml = urllib2.urlopen(request).read()
1521 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1522 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1524 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1526 self._downloader.trouble(u'WARNING: video has no closed captions')
1529 video_token = urllib.unquote_plus(video_info['token'][0])
1531 # Decide which formats to download
1532 req_format = self._downloader.params.get('format', None)
# RTMP streams are taken as-is; otherwise build an itag -> URL map from
# url_encoded_fmt_stream_map and choose formats by quality preference.
1534 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1535 self.report_rtmp_download()
1536 video_url_list = [(None, video_info['conn'][0])]
1537 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1538 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1539 url_data = [parse_qs(uds) for uds in url_data_strs]
1540 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1541 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1543 format_limit = self._downloader.params.get('format_limit', None)
1544 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1545 if format_limit is not None and format_limit in available_formats:
# format_limit caps quality: only formats at or below the limit remain.
1546 format_list = available_formats[available_formats.index(format_limit):]
1548 format_list = available_formats
1549 existing_formats = [x for x in format_list if x in url_map]
1550 if len(existing_formats) == 0:
1551 self._downloader.trouble(u'ERROR: no known formats available for video')
1553 if self._downloader.params.get('listformats', None):
1554 self._print_formats(existing_formats)
1556 if req_format is None or req_format == 'best':
1557 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1558 elif req_format == 'worst':
1559 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1560 elif req_format in ('-1', 'all'):
1561 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1563 # Specific formats. We pick the first in a slash-delimeted sequence.
1564 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1565 req_formats = req_format.split('/')
1566 video_url_list = None
1567 for rf in req_formats:
1569 video_url_list = [(rf, url_map[rf])]
1571 if video_url_list is None:
1572 self._downloader.trouble(u'ERROR: requested format not available')
1575 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1578 for format_param, video_real_url in video_url_list:
1579 # At this point we have a new video
1580 self._downloader.increment_downloads()
1583 video_extension = self._video_extensions.get(format_param, 'flv')
1586 # Process video information
1587 self._downloader.process_info({
1588 'id': video_id.decode('utf-8'),
1589 'url': video_real_url.decode('utf-8'),
1590 'uploader': video_uploader.decode('utf-8'),
1591 'upload_date': upload_date,
1592 'title': video_title,
1593 'stitle': simple_title,
1594 'ext': video_extension.decode('utf-8'),
1595 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1596 'thumbnail': video_thumbnail.decode('utf-8'),
1597 'description': video_description,
1598 'player_url': player_url,
1599 'subtitles': video_subtitles
1601 except UnavailableVideoError, err:
1602 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt listing — jumps in the embedded original line numbers
# mark source lines elided from this view.
1605 class MetacafeIE(InfoExtractor):
1606 """Information Extractor for metacafe.com."""
1608 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1609 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1610 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1612 IE_NAME = u'metacafe'
1614 def __init__(self, youtube_ie, downloader=None):
# Keeps a YoutubeIE instance to delegate 'yt-' prefixed ids to YouTube.
1615 InfoExtractor.__init__(self, downloader)
1616 self._youtube_ie = youtube_ie
1618 def report_disclaimer(self):
1619 """Report disclaimer retrieval."""
1620 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1622 def report_age_confirmation(self):
1623 """Report attempt to confirm age."""
1624 self._downloader.to_screen(u'[metacafe] Confirming age')
1626 def report_download_webpage(self, video_id):
1627 """Report webpage download."""
1628 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1630 def report_extraction(self, video_id):
1631 """Report information extraction."""
1632 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1634 def _real_initialize(self):
# Fetch the disclaimer page, then POST the family-filter form so that
# age-restricted videos are reachable during extraction.
1635 # Retrieve disclaimer
1636 request = urllib2.Request(self._DISCLAIMER)
1638 self.report_disclaimer()
1639 disclaimer = urllib2.urlopen(request).read()
1640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1647 'submit': "Continue - I'm over 18",
1649 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1651 self.report_age_confirmation()
1652 disclaimer = urllib2.urlopen(request).read()
1653 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1654 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1657 def _real_extract(self, url):
1658 # Extract id and simplified title from URL
1659 mobj = re.match(self._VALID_URL, url)
1661 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1664 video_id = mobj.group(1)
1666 # Check if video comes from YouTube
1667 mobj2 = re.match(r'^yt-(.*)$', video_id)
1668 if mobj2 is not None:
# Metacafe mirrors of YouTube videos are handed off to the YouTube IE.
1669 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1672 # At this point we have a new video
1673 self._downloader.increment_downloads()
1675 simple_title = mobj.group(2).decode('utf-8')
1677 # Retrieve video webpage to extract further information
1678 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1680 self.report_download_webpage(video_id)
1681 webpage = urllib2.urlopen(request).read()
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1686 # Extract URL, uploader and title from webpage
1687 self.report_extraction(video_id)
# Two page layouts: a direct mediaURL parameter (optionally signed with
# gdaKey), or a flashvars blob whose mediaData carries URL and key.
1688 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1689 if mobj is not None:
1690 mediaURL = urllib.unquote(mobj.group(1))
1691 video_extension = mediaURL[-3:]
1693 # Extract gdaKey if available
1694 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1696 video_url = mediaURL
1698 gdaKey = mobj.group(1)
1699 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1701 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1703 self._downloader.trouble(u'ERROR: unable to extract media URL')
1705 vardict = parse_qs(mobj.group(1))
1706 if 'mediaData' not in vardict:
1707 self._downloader.trouble(u'ERROR: unable to extract media URL')
1709 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1711 self._downloader.trouble(u'ERROR: unable to extract media URL')
1713 mediaURL = mobj.group(1).replace('\\/', '/')
1714 video_extension = mediaURL[-3:]
1715 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1717 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1719 self._downloader.trouble(u'ERROR: unable to extract title')
1721 video_title = mobj.group(1).decode('utf-8')
1722 video_title = sanitize_title(video_title)
1724 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1726 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1728 video_uploader = mobj.group(1)
1731 # Process video information
1732 self._downloader.process_info({
1733 'id': video_id.decode('utf-8'),
1734 'url': video_url.decode('utf-8'),
1735 'uploader': video_uploader.decode('utf-8'),
1736 'upload_date': u'NA',
1737 'title': video_title,
1738 'stitle': simple_title,
1739 'ext': video_extension.decode('utf-8'),
1743 except UnavailableVideoError:
1744 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt listing — jumps in the embedded original line numbers
# mark source lines elided from this view.
1747 class DailymotionIE(InfoExtractor):
1748 """Information Extractor for Dailymotion"""
1750 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1751 IE_NAME = u'dailymotion'
1753 def __init__(self, downloader=None):
1754 InfoExtractor.__init__(self, downloader)
1756 def report_download_webpage(self, video_id):
1757 """Report webpage download."""
1758 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1760 def report_extraction(self, video_id):
1761 """Report information extraction."""
1762 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1764 def _real_extract(self, url):
1765 # Extract id and simplified title from URL
1766 mobj = re.match(self._VALID_URL, url)
1768 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1771 # At this point we have a new video
1772 self._downloader.increment_downloads()
1773 video_id = mobj.group(1)
1775 video_extension = 'flv'
1777 # Retrieve video webpage to extract further information
1778 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
1779 request.add_header('Cookie', 'family_filter=off')
1781 self.report_download_webpage(video_id)
1782 webpage = urllib2.urlopen(request).read()
1783 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1784 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1787 # Extract URL, uploader and title from webpage
1788 self.report_extraction(video_id)
# The media URL lives in the player's "sequence" flash variable; the SD
# stream URL (sdURL) is pulled out of that URL-encoded blob.
1789 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1791 self._downloader.trouble(u'ERROR: unable to extract media URL')
1793 sequence = urllib.unquote(mobj.group(1))
1794 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1796 self._downloader.trouble(u'ERROR: unable to extract media URL')
1798 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1800 # if needed add http://www.dailymotion.com/ if relative URL
1802 video_url = mediaURL
1804 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1806 self._downloader.trouble(u'ERROR: unable to extract title')
1808 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1809 video_title = sanitize_title(video_title)
1810 simple_title = _simplify_title(video_title)
1812 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1814 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1816 video_uploader = mobj.group(1)
1819 # Process video information
1820 self._downloader.process_info({
1821 'id': video_id.decode('utf-8'),
1822 'url': video_url.decode('utf-8'),
1823 'uploader': video_uploader.decode('utf-8'),
1824 'upload_date': u'NA',
1825 'title': video_title,
1826 'stitle': simple_title,
1827 'ext': video_extension.decode('utf-8'),
1831 except UnavailableVideoError:
1832 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt listing — jumps in the embedded original line numbers
# mark source lines elided from this view.
1835 class GoogleIE(InfoExtractor):
1836 """Information extractor for video.google.com."""
1838 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1839 IE_NAME = u'video.google'
1841 def __init__(self, downloader=None):
1842 InfoExtractor.__init__(self, downloader)
1844 def report_download_webpage(self, video_id):
1845 """Report webpage download."""
1846 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1848 def report_extraction(self, video_id):
1849 """Report information extraction."""
1850 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1852 def _real_extract(self, url):
1853 # Extract id from URL
1854 mobj = re.match(self._VALID_URL, url)
1856 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1859 # At this point we have a new video
1860 self._downloader.increment_downloads()
1861 video_id = mobj.group(1)
1863 video_extension = 'mp4'
1865 # Retrieve video webpage to extract further information
1866 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1868 self.report_download_webpage(video_id)
1869 webpage = urllib2.urlopen(request).read()
1870 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1871 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1874 # Extract URL, uploader, and title from webpage
1875 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv videoUrl variable,
# which is JS-hex-escaped ('\x3d' = '=', '\x26' = '&') and must be decoded.
1876 mobj = re.search(r"download_url:'([^']+)'", webpage)
1878 video_extension = 'flv'
1879 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1881 self._downloader.trouble(u'ERROR: unable to extract media URL')
1883 mediaURL = urllib.unquote(mobj.group(1))
1884 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1885 mediaURL = mediaURL.replace('\\x26', '\x26')
1887 video_url = mediaURL
1889 mobj = re.search(r'<title>(.*)</title>', webpage)
1891 self._downloader.trouble(u'ERROR: unable to extract title')
1893 video_title = mobj.group(1).decode('utf-8')
1894 video_title = sanitize_title(video_title)
1895 simple_title = _simplify_title(video_title)
1897 # Extract video description
1898 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1900 self._downloader.trouble(u'ERROR: unable to extract video description')
1902 video_description = mobj.group(1).decode('utf-8')
1903 if not video_description:
1904 video_description = 'No description available.'
1906 # Extract video thumbnail
1907 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail is only available via a search-results page for this id.
1908 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1910 webpage = urllib2.urlopen(request).read()
1911 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1912 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1914 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1916 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1918 video_thumbnail = mobj.group(1)
1919 else: # we need something to pass to process_info
1920 video_thumbnail = ''
1923 # Process video information
1924 self._downloader.process_info({
1925 'id': video_id.decode('utf-8'),
1926 'url': video_url.decode('utf-8'),
1928 'upload_date': u'NA',
1929 'title': video_title,
1930 'stitle': simple_title,
1931 'ext': video_extension.decode('utf-8'),
1935 except UnavailableVideoError:
1936 self._downloader.trouble(u'\nERROR: unable to download video')
1939 class PhotobucketIE(InfoExtractor):
1940 """Information extractor for photobucket.com."""
1942 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1943 IE_NAME = u'photobucket'
1945 def __init__(self, downloader=None):
1946 InfoExtractor.__init__(self, downloader)
1948 def report_download_webpage(self, video_id):
1949 """Report webpage download."""
1950 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1952 def report_extraction(self, video_id):
1953 """Report information extraction."""
1954 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1956 def _real_extract(self, url):
1957 # Extract id from URL
1958 mobj = re.match(self._VALID_URL, url)
1960 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1963 # At this point we have a new video
1964 self._downloader.increment_downloads()
1965 video_id = mobj.group(1)
1967 video_extension = 'flv'
1969 # Retrieve video webpage to extract further information
1970 request = urllib2.Request(url)
1972 self.report_download_webpage(video_id)
1973 webpage = urllib2.urlopen(request).read()
1974 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1975 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1978 # Extract URL, uploader, and title from webpage
1979 self.report_extraction(video_id)
1980 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1982 self._downloader.trouble(u'ERROR: unable to extract media URL')
1984 mediaURL = urllib.unquote(mobj.group(1))
1986 video_url = mediaURL
1988 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1990 self._downloader.trouble(u'ERROR: unable to extract title')
1992 video_title = mobj.group(1).decode('utf-8')
1993 video_title = sanitize_title(video_title)
1994 simple_title = _simplify_title(vide_title)
1996 video_uploader = mobj.group(2).decode('utf-8')
1999 # Process video information
2000 self._downloader.process_info({
2001 'id': video_id.decode('utf-8'),
2002 'url': video_url.decode('utf-8'),
2003 'uploader': video_uploader,
2004 'upload_date': u'NA',
2005 'title': video_title,
2006 'stitle': simple_title,
2007 'ext': video_extension.decode('utf-8'),
2011 except UnavailableVideoError:
2012 self._downloader.trouble(u'\nERROR: unable to download video')
2015 class YahooIE(InfoExtractor):
2016 """Information extractor for video.yahoo.com."""
2018 # _VALID_URL matches all Yahoo! Video URLs
2019 # _VPAGE_URL matches only the extractable '/watch/' URLs
2020 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
2021 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
2022 IE_NAME = u'video.yahoo'
2024 def __init__(self, downloader=None):
2025 InfoExtractor.__init__(self, downloader)
2027 def report_download_webpage(self, video_id):
2028 """Report webpage download."""
2029 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
2031 def report_extraction(self, video_id):
2032 """Report information extraction."""
2033 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
2035 def _real_extract(self, url, new_video=True):
2036 # Extract ID from URL
2037 mobj = re.match(self._VALID_URL, url)
2039 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2042 # At this point we have a new video
2043 self._downloader.increment_downloads()
2044 video_id = mobj.group(2)
2045 video_extension = 'flv'
2047 # Rewrite valid but non-extractable URLs as
2048 # extractable English language /watch/ URLs
2049 if re.match(self._VPAGE_URL, url) is None:
2050 request = urllib2.Request(url)
2052 webpage = urllib2.urlopen(request).read()
2053 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2054 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2057 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
2059 self._downloader.trouble(u'ERROR: Unable to extract id field')
2061 yahoo_id = mobj.group(1)
2063 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2065 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2067 yahoo_vid = mobj.group(1)
2069 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2070 return self._real_extract(url, new_video=False)
2072 # Retrieve video webpage to extract further information
2073 request = urllib2.Request(url)
2075 self.report_download_webpage(video_id)
2076 webpage = urllib2.urlopen(request).read()
2077 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2078 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2081 # Extract uploader and title from webpage
2082 self.report_extraction(video_id)
2083 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2085 self._downloader.trouble(u'ERROR: unable to extract video title')
2087 video_title = mobj.group(1).decode('utf-8')
2088 simple_title = _simplify_title(video_title)
2090 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2092 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2094 video_uploader = mobj.group(1).decode('utf-8')
2096 # Extract video thumbnail
2097 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2099 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2101 video_thumbnail = mobj.group(1).decode('utf-8')
2103 # Extract video description
2104 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2106 self._downloader.trouble(u'ERROR: unable to extract video description')
2108 video_description = mobj.group(1).decode('utf-8')
2109 if not video_description:
2110 video_description = 'No description available.'
2112 # Extract video height and width
2113 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2115 self._downloader.trouble(u'ERROR: unable to extract video height')
2117 yv_video_height = mobj.group(1)
2119 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2121 self._downloader.trouble(u'ERROR: unable to extract video width')
2123 yv_video_width = mobj.group(1)
2125 # Retrieve video playlist to extract media URL
2126 # I'm not completely sure what all these options are, but we
2127 # seem to need most of them, otherwise the server sends a 401.
2128 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2129 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2130 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2131 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2132 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2134 self.report_download_webpage(video_id)
2135 webpage = urllib2.urlopen(request).read()
2136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2140 # Extract media URL from playlist XML
2141 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2143 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2145 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2146 video_url = _unescapeHTML(video_url)
2149 # Process video information
2150 self._downloader.process_info({
2151 'id': video_id.decode('utf-8'),
2153 'uploader': video_uploader,
2154 'upload_date': u'NA',
2155 'title': video_title,
2156 'stitle': simple_title,
2157 'ext': video_extension.decode('utf-8'),
2158 'thumbnail': video_thumbnail.decode('utf-8'),
2159 'description': video_description,
2160 'thumbnail': video_thumbnail,
2163 except UnavailableVideoError:
2164 self._downloader.trouble(u'\nERROR: unable to download video')
2167 class VimeoIE(InfoExtractor):
2168 """Information extractor for vimeo.com."""
2170 # _VALID_URL matches Vimeo URLs
2171 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2174 def __init__(self, downloader=None):
2175 InfoExtractor.__init__(self, downloader)
2177 def report_download_webpage(self, video_id):
2178 """Report webpage download."""
2179 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2181 def report_extraction(self, video_id):
2182 """Report information extraction."""
2183 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2185 def _real_extract(self, url, new_video=True):
2186 # Extract ID from URL
2187 mobj = re.match(self._VALID_URL, url)
2189 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2192 # At this point we have a new video
2193 self._downloader.increment_downloads()
2194 video_id = mobj.group(1)
2196 # Retrieve video webpage to extract further information
2197 request = urllib2.Request(url, None, std_headers)
2199 self.report_download_webpage(video_id)
2200 webpage = urllib2.urlopen(request).read()
2201 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2202 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2205 # Now we begin extracting as much information as we can from what we
2206 # retrieved. First we extract the information common to all extractors,
2207 # and latter we extract those that are Vimeo specific.
2208 self.report_extraction(video_id)
2210 # Extract the config JSON
2211 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2213 config = json.loads(config)
2215 self._downloader.trouble(u'ERROR: unable to extract info section')
2219 video_title = config["video"]["title"]
2220 simple_title = _simplify_title(video_title)
2223 video_uploader = config["video"]["owner"]["name"]
2225 # Extract video thumbnail
2226 video_thumbnail = config["video"]["thumbnail"]
2228 # Extract video description
2229 video_description = get_element_by_id("description", webpage)
2230 if video_description: video_description = clean_html(video_description.decode('utf8'))
2231 else: video_description = ''
2233 # Extract upload date
2234 video_upload_date = u'NA'
2235 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2236 if mobj is not None:
2237 video_upload_date = mobj.group(1)
2239 # Vimeo specific: extract request signature and timestamp
2240 sig = config['request']['signature']
2241 timestamp = config['request']['timestamp']
2243 # Vimeo specific: extract video codec and quality information
2244 # TODO bind to format param
2245 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2246 for codec in codecs:
2247 if codec[0] in config["video"]["files"]:
2248 video_codec = codec[0]
2249 video_extension = codec[1]
2250 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2251 else: quality = 'sd'
2254 self._downloader.trouble(u'ERROR: no known codec found')
2257 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2258 %(video_id, sig, timestamp, quality, video_codec.upper())
2261 # Process video information
2262 self._downloader.process_info({
2265 'uploader': video_uploader,
2266 'upload_date': video_upload_date,
2267 'title': video_title,
2268 'stitle': simple_title,
2269 'ext': video_extension,
2270 'thumbnail': video_thumbnail,
2271 'description': video_description,
2274 except UnavailableVideoError:
2275 self._downloader.trouble(u'ERROR: unable to download video')
2278 class GenericIE(InfoExtractor):
2279 """Generic last-resort information extractor."""
2282 IE_NAME = u'generic'
2284 def __init__(self, downloader=None):
2285 InfoExtractor.__init__(self, downloader)
2287 def report_download_webpage(self, video_id):
2288 """Report webpage download."""
2289 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2290 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2292 def report_extraction(self, video_id):
2293 """Report information extraction."""
2294 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2296 def _real_extract(self, url):
2297 # At this point we have a new video
2298 self._downloader.increment_downloads()
2300 video_id = url.split('/')[-1]
2301 request = urllib2.Request(url)
2303 self.report_download_webpage(video_id)
2304 webpage = urllib2.urlopen(request).read()
2305 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2306 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2308 except ValueError, err:
2309 # since this is the last-resort InfoExtractor, if
2310 # this error is thrown, it'll be thrown here
2311 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2314 self.report_extraction(video_id)
2315 # Start with something easy: JW Player in SWFObject
2316 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2318 # Broaden the search a little bit
2319 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2321 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2324 # It's possible that one of the regexes
2325 # matched, but returned an empty group:
2326 if mobj.group(1) is None:
2327 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2330 video_url = urllib.unquote(mobj.group(1))
2331 video_id = os.path.basename(video_url)
2333 # here's a fun little line of code for you:
2334 video_extension = os.path.splitext(video_id)[1][1:]
2335 video_id = os.path.splitext(video_id)[0]
2337 # it's tempting to parse this further, but you would
2338 # have to take into account all the variations like
2339 # Video Title - Site Name
2340 # Site Name | Video Title
2341 # Video Title - Tagline | Site Name
2342 # and so on and so forth; it's just not practical
2343 mobj = re.search(r'<title>(.*)</title>', webpage)
2345 self._downloader.trouble(u'ERROR: unable to extract title')
2347 video_title = mobj.group(1).decode('utf-8')
2348 video_title = sanitize_title(video_title)
2349 simple_title = _simplify_title(video_title)
2351 # video uploader is domain name
2352 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2354 self._downloader.trouble(u'ERROR: unable to extract title')
2356 video_uploader = mobj.group(1).decode('utf-8')
2359 # Process video information
2360 self._downloader.process_info({
2361 'id': video_id.decode('utf-8'),
2362 'url': video_url.decode('utf-8'),
2363 'uploader': video_uploader,
2364 'upload_date': u'NA',
2365 'title': video_title,
2366 'stitle': simple_title,
2367 'ext': video_extension.decode('utf-8'),
2371 except UnavailableVideoError, err:
2372 self._downloader.trouble(u'\nERROR: unable to download video')
2375 class YoutubeSearchIE(InfoExtractor):
2376 """Information Extractor for YouTube search queries."""
2377 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2378 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2380 _max_youtube_results = 1000
2381 IE_NAME = u'youtube:search'
2383 def __init__(self, youtube_ie, downloader=None):
2384 InfoExtractor.__init__(self, downloader)
2385 self._youtube_ie = youtube_ie
2387 def report_download_page(self, query, pagenum):
2388 """Report attempt to download playlist page with given number."""
2389 query = query.decode(preferredencoding())
2390 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2392 def _real_initialize(self):
2393 self._youtube_ie.initialize()
2395 def _real_extract(self, query):
2396 mobj = re.match(self._VALID_URL, query)
2398 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2401 prefix, query = query.split(':')
2403 query = query.encode('utf-8')
2405 self._download_n_results(query, 1)
2407 elif prefix == 'all':
2408 self._download_n_results(query, self._max_youtube_results)
2414 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2416 elif n > self._max_youtube_results:
2417 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2418 n = self._max_youtube_results
2419 self._download_n_results(query, n)
2421 except ValueError: # parsing prefix as integer fails
2422 self._download_n_results(query, 1)
2425 def _download_n_results(self, query, n):
2426 """Downloads a specified number of results for a query"""
2432 while (50 * pagenum) < limit:
2433 self.report_download_page(query, pagenum+1)
2434 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2435 request = urllib2.Request(result_url)
2437 data = urllib2.urlopen(request).read()
2438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2439 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2441 api_response = json.loads(data)['data']
2443 new_ids = list(video['id'] for video in api_response['items'])
2444 video_ids += new_ids
2446 limit = min(n, api_response['totalItems'])
2449 if len(video_ids) > n:
2450 video_ids = video_ids[:n]
2451 for id in video_ids:
2452 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2456 class GoogleSearchIE(InfoExtractor):
2457 """Information Extractor for Google Video search queries."""
2458 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2459 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2460 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2461 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2463 _max_google_results = 1000
2464 IE_NAME = u'video.google:search'
2466 def __init__(self, google_ie, downloader=None):
2467 InfoExtractor.__init__(self, downloader)
2468 self._google_ie = google_ie
2470 def report_download_page(self, query, pagenum):
2471 """Report attempt to download playlist page with given number."""
2472 query = query.decode(preferredencoding())
2473 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2475 def _real_initialize(self):
2476 self._google_ie.initialize()
2478 def _real_extract(self, query):
2479 mobj = re.match(self._VALID_URL, query)
2481 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2484 prefix, query = query.split(':')
2486 query = query.encode('utf-8')
2488 self._download_n_results(query, 1)
2490 elif prefix == 'all':
2491 self._download_n_results(query, self._max_google_results)
2497 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2499 elif n > self._max_google_results:
2500 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2501 n = self._max_google_results
2502 self._download_n_results(query, n)
2504 except ValueError: # parsing prefix as integer fails
2505 self._download_n_results(query, 1)
2508 def _download_n_results(self, query, n):
2509 """Downloads a specified number of results for a query"""
2515 self.report_download_page(query, pagenum)
2516 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2517 request = urllib2.Request(result_url)
2519 page = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2524 # Extract video identifiers
2525 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2526 video_id = mobj.group(1)
2527 if video_id not in video_ids:
2528 video_ids.append(video_id)
2529 if len(video_ids) == n:
2530 # Specified n videos reached
2531 for id in video_ids:
2532 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2535 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2536 for id in video_ids:
2537 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2540 pagenum = pagenum + 1
2543 class YahooSearchIE(InfoExtractor):
2544 """Information Extractor for Yahoo! Video search queries."""
2545 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2546 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2547 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2548 _MORE_PAGES_INDICATOR = r'\s*Next'
2550 _max_yahoo_results = 1000
2551 IE_NAME = u'video.yahoo:search'
2553 def __init__(self, yahoo_ie, downloader=None):
2554 InfoExtractor.__init__(self, downloader)
2555 self._yahoo_ie = yahoo_ie
2557 def report_download_page(self, query, pagenum):
2558 """Report attempt to download playlist page with given number."""
2559 query = query.decode(preferredencoding())
2560 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2562 def _real_initialize(self):
2563 self._yahoo_ie.initialize()
2565 def _real_extract(self, query):
2566 mobj = re.match(self._VALID_URL, query)
2568 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2571 prefix, query = query.split(':')
2573 query = query.encode('utf-8')
2575 self._download_n_results(query, 1)
2577 elif prefix == 'all':
2578 self._download_n_results(query, self._max_yahoo_results)
2584 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2586 elif n > self._max_yahoo_results:
2587 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2588 n = self._max_yahoo_results
2589 self._download_n_results(query, n)
2591 except ValueError: # parsing prefix as integer fails
2592 self._download_n_results(query, 1)
2595 def _download_n_results(self, query, n):
2596 """Downloads a specified number of results for a query"""
2599 already_seen = set()
2603 self.report_download_page(query, pagenum)
2604 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2605 request = urllib2.Request(result_url)
2607 page = urllib2.urlopen(request).read()
2608 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2609 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2612 # Extract video identifiers
2613 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2614 video_id = mobj.group(1)
2615 if video_id not in already_seen:
2616 video_ids.append(video_id)
2617 already_seen.add(video_id)
2618 if len(video_ids) == n:
2619 # Specified n videos reached
2620 for id in video_ids:
2621 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2624 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2625 for id in video_ids:
2626 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2629 pagenum = pagenum + 1
2632 class YoutubePlaylistIE(InfoExtractor):
2633 """Information Extractor for YouTube playlists."""
2635 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2636 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2637 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2638 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2640 IE_NAME = u'youtube:playlist'
2642 def __init__(self, youtube_ie, downloader=None):
2643 InfoExtractor.__init__(self, downloader)
2644 self._youtube_ie = youtube_ie
2646 def report_download_page(self, playlist_id, pagenum):
2647 """Report attempt to download playlist page with given number."""
2648 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2650 def _real_initialize(self):
2651 self._youtube_ie.initialize()
2653 def _real_extract(self, url):
2654 # Extract playlist id
2655 mobj = re.match(self._VALID_URL, url)
2657 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2661 if mobj.group(3) is not None:
2662 self._youtube_ie.extract(mobj.group(3))
2665 # Download playlist pages
2666 # prefix is 'p' as default for playlists but there are other types that need extra care
2667 playlist_prefix = mobj.group(1)
2668 if playlist_prefix == 'a':
2669 playlist_access = 'artist'
2671 playlist_prefix = 'p'
2672 playlist_access = 'view_play_list'
2673 playlist_id = mobj.group(2)
2678 self.report_download_page(playlist_id, pagenum)
2679 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2680 request = urllib2.Request(url)
2682 page = urllib2.urlopen(request).read()
2683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2684 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2687 # Extract video identifiers
2689 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2690 if mobj.group(1) not in ids_in_page:
2691 ids_in_page.append(mobj.group(1))
2692 video_ids.extend(ids_in_page)
2694 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2696 pagenum = pagenum + 1
2698 playliststart = self._downloader.params.get('playliststart', 1) - 1
2699 playlistend = self._downloader.params.get('playlistend', -1)
2700 if playlistend == -1:
2701 video_ids = video_ids[playliststart:]
2703 video_ids = video_ids[playliststart:playlistend]
2705 for id in video_ids:
2706 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2710 class YoutubeUserIE(InfoExtractor):
2711 """Information Extractor for YouTube users."""
2713 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2714 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2715 _GDATA_PAGE_SIZE = 50
2716 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2717 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2719 IE_NAME = u'youtube:user'
2721 def __init__(self, youtube_ie, downloader=None):
2722 InfoExtractor.__init__(self, downloader)
2723 self._youtube_ie = youtube_ie
2725 def report_download_page(self, username, start_index):
2726 """Report attempt to download user page."""
2727 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2728 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2730 def _real_initialize(self):
2731 self._youtube_ie.initialize()
2733 def _real_extract(self, url):
2735 mobj = re.match(self._VALID_URL, url)
2737 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2740 username = mobj.group(1)
2742 # Download video ids using YouTube Data API. Result size per
2743 # query is limited (currently to 50 videos) so we need to query
2744 # page by page until there are no video ids - it means we got
2751 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2752 self.report_download_page(username, start_index)
2754 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2757 page = urllib2.urlopen(request).read()
2758 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2759 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2762 # Extract video identifiers
2765 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2766 if mobj.group(1) not in ids_in_page:
2767 ids_in_page.append(mobj.group(1))
2769 video_ids.extend(ids_in_page)
2771 # A little optimization - if current page is not
2772 # "full", ie. does not contain PAGE_SIZE video ids then
2773 # we can assume that this page is the last one - there
2774 # are no more ids on further pages - no need to query
2777 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2782 all_ids_count = len(video_ids)
2783 playliststart = self._downloader.params.get('playliststart', 1) - 1
2784 playlistend = self._downloader.params.get('playlistend', -1)
2786 if playlistend == -1:
2787 video_ids = video_ids[playliststart:]
2789 video_ids = video_ids[playliststart:playlistend]
2791 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2792 (username, all_ids_count, len(video_ids)))
2794 for video_id in video_ids:
2795 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2798 class DepositFilesIE(InfoExtractor):
2799 """Information extractor for depositfiles.com"""
2801 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2802 IE_NAME = u'DepositFiles'
2804 def __init__(self, downloader=None):
2805 InfoExtractor.__init__(self, downloader)
2807 def report_download_webpage(self, file_id):
2808 """Report webpage download."""
2809 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2811 def report_extraction(self, file_id):
2812 """Report information extraction."""
2813 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2815 def _real_extract(self, url):
2816 # At this point we have a new file
2817 self._downloader.increment_downloads()
2819 file_id = url.split('/')[-1]
2820 # Rebuild url in english locale
2821 url = 'http://depositfiles.com/en/files/' + file_id
2823 # Retrieve file webpage with 'Free download' button pressed
2824 free_download_indication = { 'gateway_result' : '1' }
2825 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2827 self.report_download_webpage(file_id)
2828 webpage = urllib2.urlopen(request).read()
2829 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2830 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2833 # Search for the real file URL
2834 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2835 if (mobj is None) or (mobj.group(1) is None):
2836 # Try to figure out reason of the error.
2837 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2838 if (mobj is not None) and (mobj.group(1) is not None):
2839 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2840 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2842 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2845 file_url = mobj.group(1)
2846 file_extension = os.path.splitext(file_url)[1][1:]
2848 # Search for file title
2849 mobj = re.search(r'<b title="(.*?)">', webpage)
2851 self._downloader.trouble(u'ERROR: unable to extract title')
2853 file_title = mobj.group(1).decode('utf-8')
2856 # Process file information
2857 self._downloader.process_info({
2858 'id': file_id.decode('utf-8'),
2859 'url': file_url.decode('utf-8'),
2861 'upload_date': u'NA',
2862 'title': file_title,
2863 'stitle': file_title,
2864 'ext': file_extension.decode('utf-8'),
2868 except UnavailableVideoError, err:
2869 self._downloader.trouble(u'ERROR: unable to download file')
2872 class FacebookIE(InfoExtractor):
2873 """Information Extractor for Facebook"""
2875 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2876 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2877 _NETRC_MACHINE = 'facebook'
2878 _available_formats = ['video', 'highqual', 'lowqual']
2879 _video_extensions = {
2884 IE_NAME = u'facebook'
2886 def __init__(self, downloader=None):
2887 InfoExtractor.__init__(self, downloader)
2889 def _reporter(self, message):
2890 """Add header and report message."""
2891 self._downloader.to_screen(u'[facebook] %s' % message)
2893 def report_login(self):
2894 """Report attempt to log in."""
2895 self._reporter(u'Logging in')
2897 def report_video_webpage_download(self, video_id):
2898 """Report attempt to download video webpage."""
2899 self._reporter(u'%s: Downloading video webpage' % video_id)
2901 def report_information_extraction(self, video_id):
2902 """Report attempt to extract video information."""
2903 self._reporter(u'%s: Extracting video information' % video_id)
2905 def _parse_page(self, video_webpage):
2906 """Extract video information from page"""
2908 data = {'title': r'\("video_title", "(.*?)"\)',
2909 'description': r'<div class="datawrap">(.*?)</div>',
2910 'owner': r'\("video_owner_name", "(.*?)"\)',
2911 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2914 for piece in data.keys():
2915 mobj = re.search(data[piece], video_webpage)
2916 if mobj is not None:
2917 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2921 for fmt in self._available_formats:
2922 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2923 if mobj is not None:
2924 # URL is in a Javascript segment inside an escaped Unicode format within
2925 # the generally utf-8 page
2926 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2927 video_info['video_urls'] = video_urls
2931 def _real_initialize(self):
2932 if self._downloader is None:
2937 downloader_params = self._downloader.params
2939 # Attempt to use provided username and password or .netrc data
2940 if downloader_params.get('username', None) is not None:
2941 useremail = downloader_params['username']
2942 password = downloader_params['password']
2943 elif downloader_params.get('usenetrc', False):
2945 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2946 if info is not None:
2950 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2951 except (IOError, netrc.NetrcParseError), err:
2952 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2955 if useremail is None:
2964 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2967 login_results = urllib2.urlopen(request).read()
2968 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2969 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2972 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2975 def _real_extract(self, url):
2976 mobj = re.match(self._VALID_URL, url)
2978 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2980 video_id = mobj.group('ID')
2983 self.report_video_webpage_download(video_id)
2984 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2986 page = urllib2.urlopen(request)
2987 video_webpage = page.read()
2988 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2989 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2992 # Start extracting information
2993 self.report_information_extraction(video_id)
2995 # Extract information
2996 video_info = self._parse_page(video_webpage)
2999 if 'owner' not in video_info:
3000 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3002 video_uploader = video_info['owner']
3005 if 'title' not in video_info:
3006 self._downloader.trouble(u'ERROR: unable to extract video title')
3008 video_title = video_info['title']
3009 video_title = video_title.decode('utf-8')
3010 video_title = sanitize_title(video_title)
3012 simple_title = _simplify_title(video_title)
3015 if 'thumbnail' not in video_info:
3016 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3017 video_thumbnail = ''
3019 video_thumbnail = video_info['thumbnail']
3023 if 'upload_date' in video_info:
3024 upload_time = video_info['upload_date']
3025 timetuple = email.utils.parsedate_tz(upload_time)
3026 if timetuple is not None:
3028 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3033 video_description = video_info.get('description', 'No description available.')
3035 url_map = video_info['video_urls']
3036 if len(url_map.keys()) > 0:
3037 # Decide which formats to download
3038 req_format = self._downloader.params.get('format', None)
3039 format_limit = self._downloader.params.get('format_limit', None)
3041 if format_limit is not None and format_limit in self._available_formats:
3042 format_list = self._available_formats[self._available_formats.index(format_limit):]
3044 format_list = self._available_formats
3045 existing_formats = [x for x in format_list if x in url_map]
3046 if len(existing_formats) == 0:
3047 self._downloader.trouble(u'ERROR: no known formats available for video')
3049 if req_format is None:
3050 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3051 elif req_format == 'worst':
3052 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3053 elif req_format == '-1':
3054 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3057 if req_format not in url_map:
3058 self._downloader.trouble(u'ERROR: requested format not available')
3060 video_url_list = [(req_format, url_map[req_format])] # Specific format
3062 for format_param, video_real_url in video_url_list:
3064 # At this point we have a new video
3065 self._downloader.increment_downloads()
3068 video_extension = self._video_extensions.get(format_param, 'mp4')
3071 # Process video information
3072 self._downloader.process_info({
3073 'id': video_id.decode('utf-8'),
3074 'url': video_real_url.decode('utf-8'),
3075 'uploader': video_uploader.decode('utf-8'),
3076 'upload_date': upload_date,
3077 'title': video_title,
3078 'stitle': simple_title,
3079 'ext': video_extension.decode('utf-8'),
3080 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3081 'thumbnail': video_thumbnail.decode('utf-8'),
3082 'description': video_description.decode('utf-8'),
3085 except UnavailableVideoError, err:
3086 self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Group 1 captures the path component identifying the clip.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL answered with the media file itself."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # Validate the URL before any network I/O.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Ask blip.tv for the clip's metadata as unwrapped JSON.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The metadata URL served the media directly; derive title and
            # extension from the URL's basename instead of a JSON document.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'stitle': _simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # blip.tv datestamps look like "mm-dd-yy HH:MMam/pm" per the
            # strptime pattern below; normalized to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            # The extension is taken from the media URL via _URL_EXT.
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'stitle': _simplify_title(data['title']),
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3182 class MyVideoIE(InfoExtractor):
3183 """Information Extractor for myvideo.de."""
3185 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3186 IE_NAME = u'myvideo'
3188 def __init__(self, downloader=None):
3189 InfoExtractor.__init__(self, downloader)
3191 def report_download_webpage(self, video_id):
3192 """Report webpage download."""
3193 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3195 def report_extraction(self, video_id):
3196 """Report information extraction."""
3197 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3199 def _real_extract(self,url):
3200 mobj = re.match(self._VALID_URL, url)
3202 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3205 video_id = mobj.group(1)
3208 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3210 self.report_download_webpage(video_id)
3211 webpage = urllib2.urlopen(request).read()
3212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3213 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3216 self.report_extraction(video_id)
3217 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3220 self._downloader.trouble(u'ERROR: unable to extract media URL')
3222 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3224 mobj = re.search('<title>([^<]+)</title>', webpage)
3226 self._downloader.trouble(u'ERROR: unable to extract title')
3229 video_title = mobj.group(1)
3230 video_title = sanitize_title(video_title)
3232 simple_title = _simplify_title(video_title)
3235 self._downloader.process_info({
3239 'upload_date': u'NA',
3240 'title': video_title,
3241 'stitle': simple_title,
3246 except UnavailableVideoError:
3247 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts ":tds"/":colbert"-style shortcuts as well as full
    # thedailyshow.com / colbertnation.com full-episodes URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report that the per-item media configuration XML is being fetched."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report that the show's MRSS index is being fetched."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report that the player URL is being resolved."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # A shortname like ":tds" is rewritten to the show's
        # full-episodes page and re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # An empty episode group means "download the newest episode".
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # Fetching the newest episode redirects to its canonical URL;
        # re-match that so the episode group is populated.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The Flash player URL embeds the mtvnservices media URI (group 2).
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Follow redirects so the stored player URL is canonical.
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # Each <item> in the MRSS index is one segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like colon-separated components; the last one is the
            # short media id, the second-to-last names the show.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
            # For now, just pick the highest bitrate
            format,video_url = turls[-1]
            self._downloader.increment_downloads()
            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'stitle': _simplify_title(effTitle),
                'description': officialTitle,
                'player_url': playerUrl
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being fetched."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Description, thumbnail and player come from <meta> tags in the page.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = _unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = _unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = _unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries its own config location in a config= query arg.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')
        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        # Playlist entry 1 holds the actual media URL.
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
            'uploader': showName,
            'upload_date': None,
            'stitle': _simplify_title(showName),
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page exposes a separate internal id used by the moogaloop API.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')
            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        # All metadata lives under the first <video> node of the XML.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension is taken from the media URL and reused as the format.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The URL-encoded media URL is embedded in the page as flv_url=...
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> tag minus the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'thumbnail': video_thumbnail,
            'description': None,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    # Groups: 1 = uploader slug, 2 = track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Default used when the page has no description block.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Upload date like "Month DD, YYYY HH:MM", normalized to YYYYMMDD.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): std_headers is passed as urllib2.Request's 2nd
        # positional argument, which is `data` (turning this into a POST),
        # not `headers` -- confirm this is intended.
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The page stores the base64-encoded rtmpe path in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Video id and extension are both derived from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    # Groups: 1 = uploader slug, 2 = cloudcast slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        # 'best' (or an unknown bitrate) falls back to the highest available.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate until one opens without error.
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
            except TypeError: # we have no bitrate info
                ext = formats[fmt][0]
                print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # No preference / 'best': take the first format whose URL responds.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': _simplify_title(course + '_' + video),
            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            # videoFile in the XML is relative to baseUrl.
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'id': _simplify_title(course),
            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = _unescapeHTML(m.group(1))
            # Fall back to the id when the page has no <h1> title.
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = _unescapeHTML(m.group(1))

            # Each VideoPage link becomes a 'reference' entry to recurse into.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Each CoursePage link becomes a 'reference' entry to recurse into.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; default to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Song name, performer, URI and playlist id live in <meta> tags
        # and inline JavaScript of the page.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # e.g. type "video/mp4" -> ext "mp4"; format is "ext-WxH_bitrate".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        self._downloader.increment_downloads()
            'uploader': performer,
            'title': video_title,
            'stitle': _simplify_title(video_title),
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    PostProcessor objects register with a downloader through its
    add_post_processor() method.  After every successful download the
    downloader walks its chain of PostProcessors, calling run() on each
    one: the first call receives the initial information dictionary and
    every later call receives whatever the previous processor returned.
    The chain stops as soon as a processor returns None, or when the
    last processor has run.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor on one downloaded file.

        ``information`` is a dictionary like the ones composed by
        InfoExtractors, extended with a "filepath" key that points to
        the downloaded file.

        Returning None stops the postprocessing chain; returning a
        dictionary (possibly with some fields changed) passes it on to
        the next processor.  Implementations may also raise
        PostProcessingError, which the downloader takes into account.
        """
        # Base-class behavior: pass the information through unchanged.
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    Derives from Exception rather than BaseException: only
    interpreter-control exceptions (SystemExit, KeyboardInterrupt)
    belong directly under BaseException, and a subclass of
    BaseException would silently escape any ``except Exception``
    handler in callers.
    """

    def __init__(self, message):
        # Initialize the base class too, so str(err) and err.args work.
        Exception.__init__(self, message)
        # Kept as an explicit attribute: callers read err.message directly.
        self.message = message
# NOTE(review): whitespace-mangled chunk.  The leading numbers are original
# file line numbers; many lines between them (try:/except:/else:/return and
# the @staticmethod decorators) are elided, so this class is not runnable
# as-is.  Comments annotate only the visible logic.
#
# Post-processor that converts a downloaded video file into an audio-only
# file by invoking the external ffprobe/ffmpeg binaries.
4163 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best'|'aac'|'mp3'|'vorbis'|'m4a'|'wav' (None -> 'best')
# preferredquality: ffmpeg bitrate spec passed via -ab (e.g. '128K')
# keepvideo: when False the source video is deleted after conversion
4165 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4166 PostProcessor.__init__(self, downloader)
4167 if preferredcodec is None:
4168 preferredcodec = 'best'
4169 self._preferredcodec = preferredcodec
4170 self._preferredquality = preferredquality
4171 self._keepvideo = keepvideo
# Runs ffprobe to discover the audio codec of `path`; returns the codec
# name or None (return lines elided).  Presumably decorated @staticmethod
# on an elided line -- TODO confirm against upstream.
4174 def get_audio_codec(path):
4176 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
# stderr discarded; `file()` is the Python 2 built-in open
4177 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4178 output = handle.communicate()[0]
4179 if handle.wait() != 0:
4181 except (IOError, OSError):
# scan key=value lines emitted by -show_streams for the audio stream
4184 for line in output.split('\n'):
4185 if line.startswith('codec_name='):
4186 audio_codec = line.split('=')[1].strip()
4187 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcodes `path` to `out_path` using `codec` and extra ffmpeg options;
# raises AudioConversionError on failure (raise lines partly elided).
4192 def run_ffmpeg(path, out_path, codec, more_opts):
4196 acodec_opts = ['-acodec', codec]
# -vn drops the video stream; `--` guards against option-like filenames
4197 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4199 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4200 stdout,stderr = p.communicate()
4201 except (IOError, OSError):
4202 e = sys.exc_info()[1]
# errno 2 == ENOENT: the ffmpeg binary itself is missing
4203 if isinstance(e, OSError) and e.errno == 2:
4204 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
# non-zero exit: surface ffmpeg's last stderr line as the error message
4207 if p.returncode != 0:
4208 msg = stderr.strip().split('\n')[-1]
4209 raise AudioConversionError(msg)
# Main entry point called by the downloader's post-processing chain.
4211 def run(self, information):
4212 path = information['filepath']
4214 filecodec = self.get_audio_codec(path)
4215 if filecodec is None:
4216 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Codec selection: copy the stream losslessly when the source codec
# already matches the request (aac inside m4a is the same bitstream).
4220 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4221 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4222 # Lossless, but in another container
4224 extension = self._preferredcodec
# bitstream filter needed when moving raw ADTS AAC into an MP4 container
4225 more_opts = ['-absf', 'aac_adtstoasc']
4226 elif filecodec in ['aac', 'mp3', 'vorbis']:
4227 # Lossless if possible
4229 extension = filecodec
4230 if filecodec == 'aac':
4231 more_opts = ['-f', 'adts']
4232 if filecodec == 'vorbis':
# fallback branch (elided): re-encode to mp3 with libmp3lame
4236 acodec = 'libmp3lame'
4239 if self._preferredquality is not None:
4240 more_opts += ['-ab', self._preferredquality]
4242 # We convert the audio (lossy)
4243 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4244 extension = self._preferredcodec
4246 if self._preferredquality is not None:
4247 more_opts += ['-ab', self._preferredquality]
4248 if self._preferredcodec == 'aac':
4249 more_opts += ['-f', 'adts']
4250 if self._preferredcodec == 'm4a':
4251 more_opts += ['-absf', 'aac_adtstoasc']
4252 if self._preferredcodec == 'vorbis':
4254 if self._preferredcodec == 'wav':
4256 more_opts += ['-f', 'wav']
4258 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4259 new_path = prefix + sep + extension
4260 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
# conversion attempt (enclosing try: elided); failure aborts the chain
4262 self.run_ffmpeg(path, new_path, acodec, more_opts)
4264 etype,e,tb = sys.exc_info()
4265 if isinstance(e, AudioConversionError):
4266 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4268 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4271 # Try to update the date time for extracted audio file.
4272 if information.get('filetime') is not None:
4274 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4276 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# best-effort removal of the source video unless --keep-video was given
4278 if not self._keepvideo:
4280 os.remove(_encodeFilename(path))
4281 except (IOError, OSError):
4282 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# point the chain at the freshly written audio file
4285 information['filepath'] = new_path
# NOTE(review): mangled chunk -- the enclosing try:/else:/finally: lines
# around the numbered lines below are elided; not runnable as-is.
4289 def updateSelf(downloader, filename):
4290 ''' Update the program file with the latest version from the repository '''
4291 # Note: downloader only used for options
# refuse early if we cannot write the target script file
4292 if not os.access(filename, os.W_OK):
4293 sys.exit('ERROR: no write permissions on %s' % filename)
4295 downloader.to_screen(u'Updating to latest version...')
# fetch the newest script from UPDATE_URL (enclosing try: elided)
4299 urlh = urllib.urlopen(UPDATE_URL)
4300 newcontent = urlh.read()
# compare the remote __version__ string with ours; skip rewrite if equal
4302 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4303 if vmatch is not None and vmatch.group(1) == __version__:
4304 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4308 except (IOError, OSError), err:
4309 sys.exit('ERROR: unable to download latest version')
# overwrite our own script file in place (try/finally with close elided)
4312 outf = open(filename, 'wb')
4314 outf.write(newcontent)
4317 except (IOError, OSError), err:
4318 sys.exit('ERROR: unable to overwrite current version')
4320 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4323 def _readOptions(filename_bytes):
4325 optionf = open(filename_bytes)
4327 return [] # silently skip if file is not present
4331 res += shlex.split(l, comments=True)
4336 def _format_option_string(option):
4337 ''' ('-o', '--option') -> -o, --format METAVAR'''
4341 if option._short_opts: opts.append(option._short_opts[0])
4342 if option._long_opts: opts.append(option._long_opts[0])
4343 if len(opts) > 1: opts.insert(1, ', ')
4345 if option.takes_value(): opts.append(' %s' % option.metavar)
4347 return "".join(opts)
4349 def _find_term_columns():
4350 columns = os.environ.get('COLUMNS', None)
4355 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4356 out,err = sp.communicate()
4357 return int(out.split()[1])
# NOTE(review): mangled fragment of the option-parser setup.  The enclosing
# `def parseOpts():` line and several interior lines (kw dict braces, try:,
# if/else markers, some add_option calls) are elided; not runnable as-is.
# Builds the optparse parser, its option groups, and merges config-file
# options with sys.argv before parsing.
4363 max_help_position = 80
4365 # No need to wrap help messages if we're on a wide console
4366 columns = _find_term_columns()
4367 if columns: max_width = columns
# custom formatter so option strings render via _format_option_string
4369 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4370 fmt.format_option_strings = _format_option_string
# keyword arguments for OptionParser (surrounding dict literal elided)
4373 'version' : __version__,
4375 'usage' : '%prog [options] url [url...]',
4376 'conflict_handler' : 'resolve',
4379 parser = optparse.OptionParser(**kw)
# option groups; they are added to the parser further below
4382 general = optparse.OptionGroup(parser, 'General Options')
4383 selection = optparse.OptionGroup(parser, 'Video Selection')
4384 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4385 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4386 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4387 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4388 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4390 general.add_option('-h', '--help',
4391 action='help', help='print this help text and exit')
4392 general.add_option('-v', '--version',
4393 action='version', help='print program version and exit')
4394 general.add_option('-U', '--update',
4395 action='store_true', dest='update_self', help='update this program to latest version')
4396 general.add_option('-i', '--ignore-errors',
4397 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4398 general.add_option('-r', '--rate-limit',
4399 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4400 general.add_option('-R', '--retries',
4401 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4402 general.add_option('--dump-user-agent',
4403 action='store_true', dest='dump_user_agent',
4404 help='display the current browser identification', default=False)
4405 general.add_option('--list-extractors',
4406 action='store_true', dest='list_extractors',
4407 help='List all supported extractors and the URLs they would handle', default=False)
4409 selection.add_option('--playlist-start',
4410 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4411 selection.add_option('--playlist-end',
4412 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4413 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4414 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4415 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4417 authentication.add_option('-u', '--username',
4418 dest='username', metavar='USERNAME', help='account username')
4419 authentication.add_option('-p', '--password',
4420 dest='password', metavar='PASSWORD', help='account password')
4421 authentication.add_option('-n', '--netrc',
4422 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4425 video_format.add_option('-f', '--format',
4426 action='store', dest='format', metavar='FORMAT', help='video format code')
4427 video_format.add_option('--all-formats',
4428 action='store_const', dest='format', help='download all available video formats', const='all')
4429 video_format.add_option('--prefer-free-formats',
4430 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4431 video_format.add_option('--max-quality',
4432 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4433 video_format.add_option('-F', '--list-formats',
4434 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4435 video_format.add_option('--write-srt',
4436 action='store_true', dest='writesubtitles',
4437 help='write video closed captions to a .srt file (currently youtube only)', default=False)
4438 video_format.add_option('--srt-lang',
4439 action='store', dest='subtitleslang', metavar='LANG',
4440 help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
4443 verbosity.add_option('-q', '--quiet',
4444 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4445 verbosity.add_option('-s', '--simulate',
4446 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4447 verbosity.add_option('--skip-download',
4448 action='store_true', dest='skip_download', help='do not download the video', default=False)
4449 verbosity.add_option('-g', '--get-url',
4450 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4451 verbosity.add_option('-e', '--get-title',
4452 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4453 verbosity.add_option('--get-thumbnail',
4454 action='store_true', dest='getthumbnail',
4455 help='simulate, quiet but print thumbnail URL', default=False)
4456 verbosity.add_option('--get-description',
4457 action='store_true', dest='getdescription',
4458 help='simulate, quiet but print video description', default=False)
4459 verbosity.add_option('--get-filename',
4460 action='store_true', dest='getfilename',
4461 help='simulate, quiet but print output filename', default=False)
4462 verbosity.add_option('--get-format',
4463 action='store_true', dest='getformat',
4464 help='simulate, quiet but print output format', default=False)
4465 verbosity.add_option('--no-progress',
4466 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4467 verbosity.add_option('--console-title',
4468 action='store_true', dest='consoletitle',
4469 help='display progress in console titlebar', default=False)
# NOTE(review): '-v' is declared twice ('--version' above with
# conflict_handler='resolve'); the later '--verbose' wins the short flag.
4470 verbosity.add_option('-v', '--verbose',
4471 action='store_true', dest='verbose', help='print various debugging information', default=False)
4474 filesystem.add_option('-t', '--title',
4475 action='store_true', dest='usetitle', help='use title in file name', default=False)
4476 filesystem.add_option('-l', '--literal',
4477 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4478 filesystem.add_option('-A', '--auto-number',
4479 action='store_true', dest='autonumber',
4480 help='number downloaded files starting from 00000', default=False)
4481 filesystem.add_option('-o', '--output',
4482 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4483 filesystem.add_option('-a', '--batch-file',
4484 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4485 filesystem.add_option('-w', '--no-overwrites',
4486 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4487 filesystem.add_option('-c', '--continue',
4488 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4489 filesystem.add_option('--no-continue',
4490 action='store_false', dest='continue_dl',
4491 help='do not resume partially downloaded files (restart from beginning)')
4492 filesystem.add_option('--cookies',
4493 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4494 filesystem.add_option('--no-part',
4495 action='store_true', dest='nopart', help='do not use .part files', default=False)
4496 filesystem.add_option('--no-mtime',
4497 action='store_false', dest='updatetime',
4498 help='do not use the Last-modified header to set the file modification time', default=True)
4499 filesystem.add_option('--write-description',
4500 action='store_true', dest='writedescription',
4501 help='write video description to a .description file', default=False)
4502 filesystem.add_option('--write-info-json',
4503 action='store_true', dest='writeinfojson',
4504 help='write video metadata to a .info.json file', default=False)
4507 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4508 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4509 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4510 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4511 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4512 help='ffmpeg audio bitrate specification, 128k by default')
4513 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4514 help='keeps the video file on disk after the post-processing; the video is erased by default')
4517 parser.add_option_group(general)
4518 parser.add_option_group(selection)
4519 parser.add_option_group(filesystem)
4520 parser.add_option_group(verbosity)
4521 parser.add_option_group(video_format)
4522 parser.add_option_group(authentication)
4523 parser.add_option_group(postproc)
# per-user config lives in $XDG_CONFIG_HOME, falling back to ~/.config
4525 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4527 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4529 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
# system config, then user config, then the real command line (last wins)
4530 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4531 opts, args = parser.parse_args(argv)
4533 return parser, opts, args
# NOTE(review): mangled fragment -- the `return [` line and most of the
# extractor-list entries are elided between the numbered lines below.
4535 def gen_extractors():
4536 """ Return a list of an instance of every supported extractor.
4537 The order does matter; the first extractor matched is the one handling the URL.
# shared IE instances reused by the dependent playlist/search extractors
4539 youtube_ie = YoutubeIE()
4540 google_ie = GoogleIE()
4541 yahoo_ie = YahooIE()
# dependent extractors wrap the base IE so playlists/searches can delegate
4543 YoutubePlaylistIE(youtube_ie),
4544 YoutubeUserIE(youtube_ie),
4545 YoutubeSearchIE(youtube_ie),
4547 MetacafeIE(youtube_ie),
4550 GoogleSearchIE(google_ie),
4553 YahooSearchIE(yahoo_ie),
4566 StanfordOpenClassroomIE(),
# NOTE(review): mangled fragment of the program's main routine; the
# enclosing `def` line and many interior lines (try:/else:/sys.exit/
# some dict entries) are elided between the numbered lines.  Not runnable
# as-is.  Flow: parse options -> cookies -> batch URLs -> global urllib2
# setup -> option validation -> FileDownloader construction -> download.
4573 parser, opts, args = parseOpts()
4575 # Open appropriate CookieJar
4576 if opts.cookiefile is None:
4577 jar = cookielib.CookieJar()
# persistent Mozilla-format jar when --cookies FILE was given
4580 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4581 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4583 except (IOError, OSError), err:
4584 sys.exit(u'ERROR: unable to open cookie file')
4587 if opts.dump_user_agent:
4588 print std_headers['User-Agent']
4591 # Batch file verification
4593 if opts.batchfile is not None:
4595 if opts.batchfile == '-':
4598 batchfd = open(opts.batchfile, 'r')
4599 batchurls = batchfd.readlines()
4600 batchurls = [x.strip() for x in batchurls]
# lines starting with #, / or ; are comments in the batch file
4601 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4603 sys.exit(u'ERROR: batch file could not be read')
4604 all_urls = batchurls + args
4606 # General configuration
4607 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4608 proxy_handler = urllib2.ProxyHandler()
4609 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
# installed globally: every urllib2.urlopen in the program uses this opener
4610 urllib2.install_opener(opener)
4611 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4614 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4616 extractors = gen_extractors()
4618 if opts.list_extractors:
4619 for ie in extractors:
4621 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4622 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4623 for mu in matchedUrls:
4627 # Conflicting, missing and erroneous options
4628 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4629 parser.error(u'using .netrc conflicts with giving username/password')
4630 if opts.password is not None and opts.username is None:
4631 parser.error(u'account username missing')
4632 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4633 parser.error(u'using output template conflicts with using title, literal title or auto number')
4634 if opts.usetitle and opts.useliteral:
4635 parser.error(u'using title conflicts with using literal title')
4636 if opts.username is not None and opts.password is None:
4637 opts.password = getpass.getpass(u'Type account password and press return:')
4638 if opts.ratelimit is not None:
4639 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4640 if numeric_limit is None:
4641 parser.error(u'invalid rate limit specified')
4642 opts.ratelimit = numeric_limit
4643 if opts.retries is not None:
4645 opts.retries = long(opts.retries)
4646 except (TypeError, ValueError), err:
4647 parser.error(u'invalid retry count specified')
4649 opts.playliststart = int(opts.playliststart)
4650 if opts.playliststart <= 0:
4651 raise ValueError(u'Playlist start must be positive')
4652 except (TypeError, ValueError), err:
4653 parser.error(u'invalid playlist start number specified')
4655 opts.playlistend = int(opts.playlistend)
4656 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4657 raise ValueError(u'Playlist end must be greater than playlist start')
4658 except (TypeError, ValueError), err:
4659 parser.error(u'invalid playlist end number specified')
4660 if opts.extractaudio:
4661 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4662 parser.error(u'invalid audio format specified')
# central FileDownloader configured from the validated options
4665 fd = FileDownloader({
4666 'usenetrc': opts.usenetrc,
4667 'username': opts.username,
4668 'password': opts.password,
# any "get-and-print" mode implies quiet output
4669 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4670 'forceurl': opts.geturl,
4671 'forcetitle': opts.gettitle,
4672 'forcethumbnail': opts.getthumbnail,
4673 'forcedescription': opts.getdescription,
4674 'forcefilename': opts.getfilename,
4675 'forceformat': opts.getformat,
4676 'simulate': opts.simulate,
4677 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4678 'format': opts.format,
4679 'format_limit': opts.format_limit,
4680 'listformats': opts.listformats,
# template priority chain: explicit -o, then format/title/number combos
4681 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4682 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4683 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4684 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4685 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4686 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4687 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4688 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4689 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4690 or u'%(id)s.%(ext)s'),
4691 'ignoreerrors': opts.ignoreerrors,
4692 'ratelimit': opts.ratelimit,
4693 'nooverwrites': opts.nooverwrites,
4694 'retries': opts.retries,
4695 'continuedl': opts.continue_dl,
4696 'noprogress': opts.noprogress,
4697 'playliststart': opts.playliststart,
4698 'playlistend': opts.playlistend,
# writing to stdout: progress/log output must go to stderr instead
4699 'logtostderr': opts.outtmpl == '-',
4700 'consoletitle': opts.consoletitle,
4701 'nopart': opts.nopart,
4702 'updatetime': opts.updatetime,
4703 'writedescription': opts.writedescription,
4704 'writeinfojson': opts.writeinfojson,
4705 'writesubtitles': opts.writesubtitles,
4706 'subtitleslang': opts.subtitleslang,
4707 'matchtitle': opts.matchtitle,
4708 'rejecttitle': opts.rejecttitle,
4709 'max_downloads': opts.max_downloads,
4710 'prefer_free_formats': opts.prefer_free_formats,
4711 'verbose': opts.verbose,
4713 for extractor in extractors:
4714 fd.add_info_extractor(extractor)
4717 if opts.extractaudio:
4718 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4721 if opts.update_self:
4722 updateSelf(fd, sys.argv[0])
4725 if len(all_urls) < 1:
4726 if not opts.update_self:
4727 parser.error(u'you must provide at least one URL')
# main download loop (enclosing try: elided)
4732 retcode = fd.download(all_urls)
4733 except MaxDownloadsReached:
4734 fd.to_screen(u'--max-download limit reached, aborting.')
4737 # Dump cookie jar if requested
4738 if opts.cookiefile is not None:
4741 except (IOError, OSError), err:
4742 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): mangled fragment of main()'s exception handling plus the
# module entry guard; the `def main():` line, the call into _real_main,
# and the `main()` invocation under the guard are elided.
# DownloadError details were already reported by the downloader; just exit.
4749 except DownloadError:
4751 except SameFileError:
4752 sys.exit(u'ERROR: fixed output name but more than one file to download')
4753 except KeyboardInterrupt:
4754 sys.exit(u'\nERROR: Interrupted by user')
4756 if __name__ == '__main__':
4759 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: