2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
219 entity = matchobj.group(1)
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
231 numstr = u'0%s' % numstr
234 return unichr(long(numstr, base))
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
def _simplify_title(title):
    """Collapse every run of characters outside [A-Za-z0-9_-] (unicode-aware)
    in *title* into a single underscore, then trim leading and trailing
    underscores, yielding a string safe to embed in a filename."""
    # u'[^\\w\\d_\\-]+' spells the exact same pattern the ur'' raw literal did.
    pattern = re.compile(u'[^\\w\\d_\\-]+', re.UNICODE)
    simplified = pattern.sub(u'_', title)
    return simplified.strip(u'_')
285 class DownloadError(Exception):
286 """Download Error exception.
288 This exception may be thrown by FileDownloader objects if they are not
289 configured to continue on errors. They will contain the appropriate
295 class SameFileError(Exception):
296 """Same File exception.
298 This exception will be thrown by FileDownloader objects if they detect
299 multiple files would have to be downloaded to the same file on disk.
304 class PostProcessingError(Exception):
305 """Post Processing exception.
307 This exception may be raised by PostProcessor's .run() method to
308 indicate an error in the postprocessing task.
313 class UnavailableVideoError(Exception):
314 """Unavailable Format exception.
316 This exception will be thrown when a video is requested
317 in a format that is not available for that video.
322 class ContentTooShortError(Exception):
323 """Content Too Short exception.
325 This exception may be raised by FileDownloader objects when a file they
326 download is too small for what the server announced first, indicating
327 the connection was probably interrupted.
333 def __init__(self, downloaded, expected):
334 self.downloaded = downloaded
335 self.expected = expected
338 class YoutubeDLHandler(urllib2.HTTPHandler):
339 """Handler for HTTP requests and responses.
341 This class, when installed with an OpenerDirector, automatically adds
342 the standard headers to every HTTP request and handles gzipped and
343 deflated responses from web servers. If compression is to be avoided in
344 a particular request, the original request in the program code only has
345 to include the HTTP header "Youtubedl-No-Compression", which will be
346 removed before making the real request.
348 Part of this code was copied from:
350 http://techknack.net/python-urllib2-handlers/
352 Andrew Rowls, the author of that code, agreed to release it to the
359 return zlib.decompress(data, -zlib.MAX_WBITS)
361 return zlib.decompress(data)
364 def addinfourl_wrapper(stream, headers, url, code):
365 if hasattr(urllib2.addinfourl, 'getcode'):
366 return urllib2.addinfourl(stream, headers, url, code)
367 ret = urllib2.addinfourl(stream, headers, url)
371 def http_request(self, req):
372 for h in std_headers:
375 req.add_header(h, std_headers[h])
376 if 'Youtubedl-no-compression' in req.headers:
377 if 'Accept-encoding' in req.headers:
378 del req.headers['Accept-encoding']
379 del req.headers['Youtubedl-no-compression']
382 def http_response(self, req, resp):
385 if resp.headers.get('Content-encoding', '') == 'gzip':
386 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
387 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
388 resp.msg = old_resp.msg
390 if resp.headers.get('Content-encoding', '') == 'deflate':
391 gz = StringIO.StringIO(self.deflate(resp.read()))
392 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
393 resp.msg = old_resp.msg
397 class FileDownloader(object):
398 """File Downloader class.
400 File downloader objects are the ones responsible of downloading the
401 actual video file and writing it to disk if the user has requested
402 it, among some other tasks. In most cases there should be one per
403 program. As, given a video URL, the downloader doesn't know how to
404 extract all the needed information, task that InfoExtractors do, it
405 has to pass the URL to one of them.
407 For this, file downloader objects have a method that allows
408 InfoExtractors to be registered in a given order. When it is passed
409 a URL, the file downloader handles it to the first InfoExtractor it
410 finds that reports being able to handle it. The InfoExtractor extracts
411 all the information about the video or videos the URL refers to, and
412 asks the FileDownloader to process the video information, possibly
413 downloading the video.
415 File downloaders accept a lot of parameters. In order not to saturate
416 the object constructor with arguments, it receives a dictionary of
417 options instead. These options are available through the params
418 attribute for the InfoExtractors to use. The FileDownloader also
419 registers itself as the downloader in charge for the InfoExtractors
420 that are added to it, so this is a "mutual registration".
424 username: Username for authentication purposes.
425 password: Password for authentication purposes.
426 usenetrc: Use netrc for authentication instead.
427 quiet: Do not print messages to stdout.
428 forceurl: Force printing final URL.
429 forcetitle: Force printing title.
430 forcethumbnail: Force printing thumbnail URL.
431 forcedescription: Force printing description.
432 forcefilename: Force printing final filename.
433 simulate: Do not download the video files.
434 format: Video format code.
435 format_limit: Highest quality format to try.
436 outtmpl: Template for output names.
437 ignoreerrors: Do not stop on download errors.
438 ratelimit: Download speed limit, in bytes/sec.
439 nooverwrites: Prevent overwriting files.
440 retries: Number of times to retry for HTTP error 5xx
441 continuedl: Try to continue downloads if possible.
442 noprogress: Do not print the progress bar.
443 playliststart: Playlist item to start at.
444 playlistend: Playlist item to end at.
445 matchtitle: Download only matching titles.
446 rejecttitle: Reject downloads for matching titles.
447 logtostderr: Log messages to stderr instead of stdout.
448 consoletitle: Display progress in console window's titlebar.
449 nopart: Do not use temporary .part files.
450 updatetime: Use the Last-modified header to set output file timestamps.
451 writedescription: Write the video description to a .description file
452 writeinfojson: Write the video description to a .info.json file
458 _download_retcode = None
459 _num_downloads = None
462 def __init__(self, params):
463 """Create a FileDownloader object with the given options."""
466 self._download_retcode = 0
467 self._num_downloads = 0
468 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
472 def format_bytes(bytes):
475 if type(bytes) is str:
480 exponent = long(math.log(bytes, 1024.0))
481 suffix = 'bkMGTPEZY'[exponent]
482 converted = float(bytes) / float(1024 ** exponent)
483 return '%.2f%s' % (converted, suffix)
486 def calc_percent(byte_counter, data_len):
489 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
492 def calc_eta(start, now, total, current):
496 if current == 0 or dif < 0.001: # One millisecond
498 rate = float(current) / dif
499 eta = long((float(total) - float(current)) / rate)
500 (eta_mins, eta_secs) = divmod(eta, 60)
503 return '%02d:%02d' % (eta_mins, eta_secs)
506 def calc_speed(start, now, bytes):
508 if bytes == 0 or dif < 0.001: # One millisecond
509 return '%10s' % '---b/s'
510 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
513 def best_block_size(elapsed_time, bytes):
514 new_min = max(bytes / 2.0, 1.0)
515 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516 if elapsed_time < 0.001:
518 rate = bytes / elapsed_time
526 def parse_bytes(bytestr):
527 """Parse a string indicating a byte quantity into a long integer."""
528 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
531 number = float(matchobj.group(1))
532 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533 return long(round(number * multiplier))
535 def add_info_extractor(self, ie):
536 """Add an InfoExtractor object to the end of the list."""
538 ie.set_downloader(self)
540 def add_post_processor(self, pp):
541 """Add a PostProcessor object to the end of the chain."""
543 pp.set_downloader(self)
545 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
546 """Print message to stdout if not in quiet mode."""
548 if not self.params.get('quiet', False):
549 terminator = [u'\n', u''][skip_eol]
550 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
551 self._screen_file.flush()
552 except (UnicodeEncodeError), err:
553 if not ignore_encoding_errors:
556 def to_stderr(self, message):
557 """Print message to stderr."""
558 print >>sys.stderr, message.encode(preferredencoding())
560 def to_cons_title(self, message):
561 """Set console/terminal window title to message."""
562 if not self.params.get('consoletitle', False):
564 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
565 # c_wchar_p() might not be necessary if `message` is
566 # already of type unicode()
567 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
568 elif 'TERM' in os.environ:
569 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
571 def fixed_template(self):
572 """Checks if the output template is fixed."""
573 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
575 def trouble(self, message=None):
576 """Determine action to take when a download problem appears.
578 Depending on if the downloader has been configured to ignore
579 download errors or not, this method may throw an exception or
580 not when errors are found, after printing the message.
582 if message is not None:
583 self.to_stderr(message)
584 if not self.params.get('ignoreerrors', False):
585 raise DownloadError(message)
586 self._download_retcode = 1
588 def slow_down(self, start_time, byte_counter):
589 """Sleep if the download speed is over the rate limit."""
590 rate_limit = self.params.get('ratelimit', None)
591 if rate_limit is None or byte_counter == 0:
594 elapsed = now - start_time
597 speed = float(byte_counter) / elapsed
598 if speed > rate_limit:
599 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
601 def temp_name(self, filename):
602 """Returns a temporary filename for the given filename."""
603 if self.params.get('nopart', False) or filename == u'-' or \
604 (os.path.exists(filename) and not os.path.isfile(filename)):
606 return filename + u'.part'
608 def undo_temp_name(self, filename):
609 if filename.endswith(u'.part'):
610 return filename[:-len(u'.part')]
613 def try_rename(self, old_filename, new_filename):
615 if old_filename == new_filename:
617 os.rename(old_filename, new_filename)
618 except (IOError, OSError), err:
619 self.trouble(u'ERROR: unable to rename file')
621 def try_utime(self, filename, last_modified_hdr):
622 """Try to set the last-modified time of the given file."""
623 if last_modified_hdr is None:
625 if not os.path.isfile(filename):
627 timestr = last_modified_hdr
630 filetime = timeconvert(timestr)
634 os.utime(filename, (time.time(), filetime))
639 def report_writedescription(self, descfn):
640 """ Report that the description file is being written """
641 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
643 def report_writeinfojson(self, infofn):
644 """ Report that the metadata file has been written """
645 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
647 def report_destination(self, filename):
648 """Report destination filename."""
649 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
651 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
652 """Report download progress."""
653 if self.params.get('noprogress', False):
655 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
656 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
657 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
658 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
660 def report_resuming_byte(self, resume_len):
661 """Report attempt to resume at given byte."""
662 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
664 def report_retry(self, count, retries):
665 """Report retry in case of HTTP error 5xx"""
666 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
668 def report_file_already_downloaded(self, file_name):
669 """Report file has already been fully downloaded."""
671 self.to_screen(u'[download] %s has already been downloaded' % file_name)
672 except (UnicodeEncodeError), err:
673 self.to_screen(u'[download] The file has already been downloaded')
675 def report_unable_to_resume(self):
676 """Report it was impossible to resume download."""
677 self.to_screen(u'[download] Unable to resume')
679 def report_finish(self):
680 """Report download finished."""
681 if self.params.get('noprogress', False):
682 self.to_screen(u'[download] Download completed')
686 def increment_downloads(self):
687 """Increment the ordinal that assigns a number to each file."""
688 self._num_downloads += 1
690 def prepare_filename(self, info_dict):
691 """Generate the output filename."""
693 template_dict = dict(info_dict)
694 template_dict['epoch'] = unicode(long(time.time()))
695 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696 filename = self.params['outtmpl'] % template_dict
698 except (ValueError, KeyError), err:
699 self.trouble(u'ERROR: invalid system charset or erroneous output template')
702 def _match_entry(self, info_dict):
703 """ Returns None iff the file should be downloaded """
705 title = info_dict['title']
706 matchtitle = self.params.get('matchtitle', False)
707 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
708 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
709 rejecttitle = self.params.get('rejecttitle', False)
710 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
711 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
714 def process_dict(self, info_dict):
715 """ Download and handle the extracted information.
716 For details on the specification of the various types of content, refer to the _process_* functions. """
717 if info_dict['type'] == 'playlist':
718 self._process_playlist(info_dict)
719 elif info_dict['type'] == 'legacy-video':
720 self.process_info(info_dict)
722 raise ValueError('Invalid item type')
724 def _process_playlist(self, info_dict):
725 assert info_dict['type'] == 'playlist'
726 assert 'title' in info_dict
727 assert 'stitle' in info_dict
728 entries = info_dict['list']
733 def process_info(self, info_dict):
734 """Process a single dictionary returned by an InfoExtractor."""
736 reason = self._match_entry(info_dict)
737 if reason is not None:
738 self.to_screen(u'[download] ' + reason)
741 max_downloads = self.params.get('max_downloads')
742 if max_downloads is not None:
743 if self._num_downloads > int(max_downloads):
744 self.to_screen(u'[download] Maximum number of downloads reached. Skipping ' + info_dict['title'])
747 filename = self.prepare_filename(info_dict)
750 if self.params.get('forcetitle', False):
751 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceurl', False):
753 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
754 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
755 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
756 if self.params.get('forcedescription', False) and 'description' in info_dict:
757 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
758 if self.params.get('forcefilename', False) and filename is not None:
759 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
760 if self.params.get('forceformat', False):
761 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
763 # Do nothing else if in simulate mode
764 if self.params.get('simulate', False):
770 if self.params.get('nooverwrites', False) and os.path.exists(filename):
771 self.to_stderr(u'WARNING: file exists and will be skipped')
775 dn = os.path.dirname(filename)
776 if dn != '' and not os.path.exists(dn):
778 except (OSError, IOError), err:
779 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
782 if self.params.get('writedescription', False):
784 descfn = filename + '.description'
785 self.report_writedescription(descfn)
786 descfile = open(descfn, 'wb')
788 descfile.write(info_dict['description'].encode('utf-8'))
791 except (OSError, IOError):
792 self.trouble(u'ERROR: Cannot write description file ' + descfn)
795 if self.params.get('writeinfojson', False):
796 infofn = filename + '.info.json'
797 self.report_writeinfojson(infofn)
800 except (NameError,AttributeError):
801 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
804 infof = open(infofn, 'wb')
806 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
807 json.dump(json_info_dict, infof)
810 except (OSError, IOError):
811 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
814 if not self.params.get('skip_download', False):
816 success = self._do_download(filename, info_dict)
817 except (OSError, IOError), err:
818 raise UnavailableVideoError
819 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
820 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
822 except (ContentTooShortError, ), err:
823 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
828 self.post_process(filename, info_dict)
829 except (PostProcessingError), err:
830 self.trouble(u'ERROR: postprocessing: %s' % str(err))
833 def download(self, url_list):
834 """Download a given list of URLs."""
835 if len(url_list) > 1 and self.fixed_template():
836 raise SameFileError(self.params['outtmpl'])
839 suitable_found = False
841 # Go to next InfoExtractor if not suitable
842 if not ie.suitable(url):
845 # Suitable InfoExtractor found
846 suitable_found = True
848 # Extract information from URL and process it
851 # Suitable InfoExtractor had been found; go to next URL
854 if not suitable_found:
855 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
857 return self._download_retcode
859 def post_process(self, filename, ie_info):
860 """Run the postprocessing chain on the given file."""
862 info['filepath'] = filename
868 def _download_with_rtmpdump(self, filename, url, player_url):
869 self.report_destination(filename)
870 tmpfilename = self.temp_name(filename)
872 # Check for rtmpdump first
874 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
875 except (OSError, IOError):
876 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
879 # Download using rtmpdump. rtmpdump returns exit code 2 when
880 # the connection was interrupted and resuming appears to be
881 # possible. This is part of rtmpdump's normal usage, AFAIK.
882 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
883 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
884 while retval == 2 or retval == 1:
885 prevsize = os.path.getsize(tmpfilename)
886 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
887 time.sleep(5.0) # This seems to be needed
888 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
889 cursize = os.path.getsize(tmpfilename)
890 if prevsize == cursize and retval == 1:
892 # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
893 if prevsize == cursize and retval == 2 and cursize > 1024:
894 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
898 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
899 self.try_rename(tmpfilename, filename)
902 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
905 def _do_download(self, filename, info_dict):
906 url = info_dict['url']
907 player_url = info_dict.get('player_url', None)
909 # Check file already present
910 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
911 self.report_file_already_downloaded(filename)
914 # Attempt to download using rtmpdump
915 if url.startswith('rtmp'):
916 return self._download_with_rtmpdump(filename, url, player_url)
918 tmpfilename = self.temp_name(filename)
921 # Do not include the Accept-Encoding header
922 headers = {'Youtubedl-no-compression': 'True'}
923 basic_request = urllib2.Request(url, None, headers)
924 request = urllib2.Request(url, None, headers)
926 # Establish possible resume length
927 if os.path.isfile(tmpfilename):
928 resume_len = os.path.getsize(tmpfilename)
934 if self.params.get('continuedl', False):
935 self.report_resuming_byte(resume_len)
936 request.add_header('Range','bytes=%d-' % resume_len)
942 retries = self.params.get('retries', 0)
943 while count <= retries:
944 # Establish connection
946 if count == 0 and 'urlhandle' in info_dict:
947 data = info_dict['urlhandle']
948 data = urllib2.urlopen(request)
950 except (urllib2.HTTPError, ), err:
951 if (err.code < 500 or err.code >= 600) and err.code != 416:
952 # Unexpected HTTP error
954 elif err.code == 416:
955 # Unable to resume (requested range not satisfiable)
957 # Open the connection again without the range header
958 data = urllib2.urlopen(basic_request)
959 content_length = data.info()['Content-Length']
960 except (urllib2.HTTPError, ), err:
961 if err.code < 500 or err.code >= 600:
964 # Examine the reported length
965 if (content_length is not None and
966 (resume_len - 100 < long(content_length) < resume_len + 100)):
967 # The file had already been fully downloaded.
968 # Explanation to the above condition: in issue #175 it was revealed that
969 # YouTube sometimes adds or removes a few bytes from the end of the file,
970 # changing the file size slightly and causing problems for some users. So
971 # I decided to implement a suggested change and consider the file
972 # completely downloaded if the file size differs less than 100 bytes from
973 # the one in the hard drive.
974 self.report_file_already_downloaded(filename)
975 self.try_rename(tmpfilename, filename)
978 # The length does not match, we start the download over
979 self.report_unable_to_resume()
985 self.report_retry(count, retries)
988 self.trouble(u'ERROR: giving up after %s retries' % retries)
991 data_len = data.info().get('Content-length', None)
992 if data_len is not None:
993 data_len = long(data_len) + resume_len
994 data_len_str = self.format_bytes(data_len)
995 byte_counter = 0 + resume_len
1000 before = time.time()
1001 data_block = data.read(block_size)
1003 if len(data_block) == 0:
1005 byte_counter += len(data_block)
1007 # Open file just in time
1010 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1011 assert stream is not None
1012 filename = self.undo_temp_name(tmpfilename)
1013 self.report_destination(filename)
1014 except (OSError, IOError), err:
1015 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1018 stream.write(data_block)
1019 except (IOError, OSError), err:
1020 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1022 block_size = self.best_block_size(after - before, len(data_block))
1025 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1026 if data_len is None:
1027 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1029 percent_str = self.calc_percent(byte_counter, data_len)
1030 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1031 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1034 self.slow_down(start, byte_counter - resume_len)
1037 self.trouble(u'\nERROR: Did not get any data blocks')
1040 self.report_finish()
1041 if data_len is not None and byte_counter != data_len:
1042 raise ContentTooShortError(byte_counter, long(data_len))
1043 self.try_rename(tmpfilename, filename)
1045 # Update file modification time
1046 if self.params.get('updatetime', True):
1047 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1052 class InfoExtractor(object):
1053 """Information Extractor class.
1055 Information extractors are the classes that, given a URL, extract
1056 information from the video (or videos) the URL refers to. This
1057 information includes the real video URL, the video title and simplified
1058 title, author and others. The information is stored in a dictionary
1059 which is then passed to the FileDownloader. The FileDownloader
1060 processes this information possibly downloading the video to the file
1061 system, among other possible outcomes. The dictionaries must include
1062 the following fields:
1064 id: Video identifier.
1065 url: Final video URL.
1066 uploader: Nickname of the video uploader.
1067 title: Literal title.
1068 stitle: Simplified title.
1069 ext: Video filename extension.
1070 format: Video format.
1071 player_url: SWF Player URL (may be None).
1073 The following fields are optional. Their primary purpose is to allow
1074 youtube-dl to serve as the backend for a video search function, such
1075 as the one in youtube2mp3. They are only used when their respective
1076 forced printing functions are called:
1078 thumbnail: Full URL to a video thumbnail image.
1079 description: One-line video description.
1081 Subclasses of this one should re-define the _real_initialize() and
1082 _real_extract() methods and define a _VALID_URL regexp.
1083 Probably, they should also be added to the list of extractors.
1089 def __init__(self, downloader=None):
1090 """Constructor. Receives an optional downloader."""
1092 self.set_downloader(downloader)
1094 def suitable(self, url):
1095 """Receives a URL and returns True if suitable for this IE."""
1096 return re.match(self._VALID_URL, url) is not None
1098 def initialize(self):
1099 """Initializes an instance (authentication, etc)."""
1101 self._real_initialize()
1104 def extract(self, url):
1105 """Extracts URL information and returns it in list of dicts."""
1107 return self._real_extract(url)
1109 def set_downloader(self, downloader):
1110 """Sets the downloader for this IE."""
1111 self._downloader = downloader
1113 def _real_initialize(self):
1114 """Real initialization process. Redefine in subclasses."""
1117 def _real_extract(self, url):
1118 """Real extraction process. Redefine in subclasses."""
1122 class YoutubeIE(InfoExtractor):
1123 """Information extractor for youtube.com."""
# Matches watch pages, youtu.be short links, embeds and bare video ids,
# while rejecting playlist/artist/my_playlists pages via the lookahead.
1125 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Forces the English interface so extraction regexes stay stable.
1126 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1127 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1128 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1129 _NETRC_MACHINE = 'youtube'
1130 # Listed in order of quality
1131 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
# itag -> file extension; 'flv' is used as the default elsewhere.
# NOTE(review): the two dict literals below appear truncated in this copy.
1132 _video_extensions = {
1138 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1143 _video_dimensions = {
1158 IE_NAME = u'youtube'
def report_lang(self):
    """Announce the attempt to set the interface language."""
    # Progress messages are routed through the shared downloader object.
    self._downloader.to_screen(u'[youtube] Setting language')
1164 def report_login(self):
1165 """Report attempt to log in."""
# User-visible progress message only; no state changes.
1166 self._downloader.to_screen(u'[youtube] Logging in')
1168 def report_age_confirmation(self):
1169 """Report attempt to confirm age."""
1170 self._downloader.to_screen(u'[youtube] Confirming age')
1172 def report_video_webpage_download(self, video_id):
1173 """Report attempt to download video webpage."""
1174 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1176 def report_video_info_webpage_download(self, video_id):
1177 """Report attempt to download video info webpage."""
1178 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1180 def report_information_extraction(self, video_id):
1181 """Report attempt to extract video information."""
1182 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1184 def report_unavailable_format(self, video_id, format):
1185 """Report that the requested format is not available for this video."""
1186 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1188 def report_rtmp_download(self):
1189 """Indicate the download will use the RTMP protocol."""
1190 self._downloader.to_screen(u'[youtube] RTMP download detected')
1192 def _print_formats(self, formats):
# Prints one "itag : extension [dimensions]" line per available format.
# NOTE(review): 'x' is used below without a visible binding; an enclosing
# 'for x in formats:' loop appears to be missing in this copy -- confirm.
1193 print 'Available formats:'
1195 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1197 def _real_initialize(self):
# Sets the interface language, optionally logs in (explicit credentials
# or .netrc), and confirms age. All failures are soft warnings except
# age confirmation, which is reported via trouble().
1198 if self._downloader is None:
1203 downloader_params = self._downloader.params
1205 # Attempt to use provided username and password or .netrc data
1206 if downloader_params.get('username', None) is not None:
1207 username = downloader_params['username']
1208 password = downloader_params['password']
1209 elif downloader_params.get('usenetrc', False):
1211 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1212 if info is not None:
1216 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1217 except (IOError, netrc.NetrcParseError), err:
1218 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English interface so later regexes match reliably.
1222 request = urllib2.Request(self._LANG_URL)
1225 urllib2.urlopen(request).read()
1226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1230 # No authentication to be performed
1231 if username is None:
# NOTE(review): the 'login_form = {' opener is not visible in this copy.
1236 'current_form': 'loginForm',
1238 'action_login': 'Log In',
1239 'username': username,
1240 'password': password,
1242 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1245 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1246 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1247 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1250 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# NOTE(review): the 'age_form = {' opener is not visible in this copy.
1256 'action_confirm': 'Confirm',
1258 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1260 self.report_age_confirmation()
1261 age_results = urllib2.urlopen(request).read()
1262 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1263 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1266 def _real_extract(self, url):
# Full extraction pipeline: id -> watch page -> get_video_info ->
# metadata fields -> format selection -> process_info() per format.
1267 # Extract video id from URL
1268 mobj = re.match(self._VALID_URL, url)
1270 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1272 video_id = mobj.group(2)
1275 self.report_video_webpage_download(video_id)
# has_verified=1 helps bypass some age-restriction interstitials.
1276 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1278 video_webpage = urllib2.urlopen(request).read()
1279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1280 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1283 # Attempt to extract SWF player URL
1284 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1285 if mobj is not None:
# Unescape the JSON-style backslash escaping in the matched URL.
1286 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1291 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; stop at the first response with a token.
1292 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1293 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1294 % (video_id, el_type))
1295 request = urllib2.Request(video_info_url)
1297 video_info_webpage = urllib2.urlopen(request).read()
1298 video_info = parse_qs(video_info_webpage)
1299 if 'token' in video_info:
1301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1302 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1304 if 'token' not in video_info:
1305 if 'reason' in video_info:
1306 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1308 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1311 # Start extracting information
1312 self.report_information_extraction(video_id)
1315 if 'author' not in video_info:
1316 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1318 video_uploader = urllib.unquote_plus(video_info['author'][0])
1321 if 'title' not in video_info:
1322 self._downloader.trouble(u'ERROR: unable to extract video title')
1324 video_title = urllib.unquote_plus(video_info['title'][0])
1325 video_title = video_title.decode('utf-8')
1326 video_title = sanitize_title(video_title)
1329 simple_title = _simplify_title(video_title)
1332 if 'thumbnail_url' not in video_info:
1333 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1334 video_thumbnail = ''
1335 else: # don't panic if we can't find it
1336 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page, normalised to YYYYMMDD.
1340 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1341 if mobj is not None:
1342 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1343 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1344 for expression in format_expressions:
1346 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1354 video_description = u'No description available.'
# Description extraction is only attempted when it will actually be used.
1355 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1356 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1357 if mobj is not None:
1358 video_description = mobj.group(1).decode('utf-8')
1360 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1361 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1362 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1363 # TODO use another parser
1366 video_token = urllib.unquote_plus(video_info['token'][0])
1368 # Decide which formats to download
1369 req_format = self._downloader.params.get('format', None)
1371 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1372 self.report_rtmp_download()
# RTMP downloads carry no itag; format_param is None in that case.
1373 video_url_list = [(None, video_info['conn'][0])]
1374 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1375 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1376 url_data = [parse_qs(uds) for uds in url_data_strs]
1377 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1378 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1380 format_limit = self._downloader.params.get('format_limit', None)
1381 if format_limit is not None and format_limit in self._available_formats:
# Restrict to formats at or below the requested quality cap.
1382 format_list = self._available_formats[self._available_formats.index(format_limit):]
1384 format_list = self._available_formats
1385 existing_formats = [x for x in format_list if x in url_map]
1386 if len(existing_formats) == 0:
1387 self._downloader.trouble(u'ERROR: no known formats available for video')
1389 if self._downloader.params.get('listformats', None):
1390 self._print_formats(existing_formats)
# existing_formats is ordered best-first (see _available_formats).
1392 if req_format is None or req_format == 'best':
1393 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1394 elif req_format == 'worst':
1395 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1396 elif req_format in ('-1', 'all'):
1397 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1399 # Specific formats. We pick the first in a slash-delimeted sequence.
1400 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1401 req_formats = req_format.split('/')
1402 video_url_list = None
1403 for rf in req_formats:
1405 video_url_list = [(rf, url_map[rf])]
1407 if video_url_list is None:
1408 self._downloader.trouble(u'ERROR: requested format not available')
1411 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1414 for format_param, video_real_url in video_url_list:
1415 # At this point we have a new video
1416 self._downloader.increment_downloads()
1419 video_extension = self._video_extensions.get(format_param, 'flv')
1422 # Process video information
1423 self._downloader.process_info({
1424 'id': video_id.decode('utf-8'),
1425 'url': video_real_url.decode('utf-8'),
1426 'uploader': video_uploader.decode('utf-8'),
1427 'upload_date': upload_date,
1428 'title': video_title,
1429 'stitle': simple_title,
1430 'ext': video_extension.decode('utf-8'),
# and/or idiom: u'NA' when format_param is None, else the decoded itag.
1431 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1432 'thumbnail': video_thumbnail.decode('utf-8'),
1433 'description': video_description,
1434 'player_url': player_url,
1436 except UnavailableVideoError, err:
1437 self._downloader.trouble(u'\nERROR: unable to download video')
1440 class MetacafeIE(InfoExtractor):
1441 """Information Extractor for metacafe.com."""
1443 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Disclaimer page fetched once; the filter POST disables family filtering.
1444 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1445 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1447 IE_NAME = u'metacafe'
1449 def __init__(self, youtube_ie, downloader=None):
1450 InfoExtractor.__init__(self, downloader)
# Kept so yt-prefixed Metacafe ids can be delegated to the YouTube IE.
1451 self._youtube_ie = youtube_ie
def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    # Progress output goes through the shared downloader.
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1457 def report_age_confirmation(self):
1458 """Report attempt to confirm age."""
1459 self._downloader.to_screen(u'[metacafe] Confirming age')
1461 def report_download_webpage(self, video_id):
1462 """Report webpage download."""
1463 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1465 def report_extraction(self, video_id):
1466 """Report information extraction."""
1467 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1469 def _real_initialize(self):
# Fetch the disclaimer page, then POST the filter form to turn the
# family filter off for this session.
1470 # Retrieve disclaimer
1471 request = urllib2.Request(self._DISCLAIMER)
1473 self.report_disclaimer()
1474 disclaimer = urllib2.urlopen(request).read()
1475 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1476 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# NOTE(review): the 'disclaimer_form = {' opener is not visible here.
1482 'submit': "Continue - I'm over 18",
1484 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1486 self.report_age_confirmation()
1487 disclaimer = urllib2.urlopen(request).read()
1488 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1489 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1492 def _real_extract(self, url):
# Extracts the media URL either from &mediaURL= (plus optional gdaKey)
# or from the flashvars 'mediaData' blob; yt- ids are delegated to the
# YouTube extractor.
1493 # Extract id and simplified title from URL
1494 mobj = re.match(self._VALID_URL, url)
1496 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1499 video_id = mobj.group(1)
1501 # Check if video comes from YouTube
1502 mobj2 = re.match(r'^yt-(.*)$', video_id)
1503 if mobj2 is not None:
1504 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1507 # At this point we have a new video
1508 self._downloader.increment_downloads()
# Second URL path component doubles as the simplified title.
1510 simple_title = mobj.group(2).decode('utf-8')
1512 # Retrieve video webpage to extract further information
1513 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1515 self.report_download_webpage(video_id)
1516 webpage = urllib2.urlopen(request).read()
1517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1521 # Extract URL, uploader and title from webpage
1522 self.report_extraction(video_id)
1523 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1524 if mobj is not None:
1525 mediaURL = urllib.unquote(mobj.group(1))
# Extension is inferred from the last three characters of the media URL.
1526 video_extension = mediaURL[-3:]
1528 # Extract gdaKey if available
1529 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1531 video_url = mediaURL
1533 gdaKey = mobj.group(1)
1534 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars parameter block instead.
1536 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1538 self._downloader.trouble(u'ERROR: unable to extract media URL')
1540 vardict = parse_qs(mobj.group(1))
1541 if 'mediaData' not in vardict:
1542 self._downloader.trouble(u'ERROR: unable to extract media URL')
1544 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1546 self._downloader.trouble(u'ERROR: unable to extract media URL')
1548 mediaURL = mobj.group(1).replace('\\/', '/')
1549 video_extension = mediaURL[-3:]
1550 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1552 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1554 self._downloader.trouble(u'ERROR: unable to extract title')
1556 video_title = mobj.group(1).decode('utf-8')
1557 video_title = sanitize_title(video_title)
1559 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1561 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1563 video_uploader = mobj.group(1)
1566 # Process video information
1567 self._downloader.process_info({
1568 'id': video_id.decode('utf-8'),
1569 'url': video_url.decode('utf-8'),
1570 'uploader': video_uploader.decode('utf-8'),
1571 'upload_date': u'NA',
1572 'title': video_title,
1573 'stitle': simple_title,
1574 'ext': video_extension.decode('utf-8'),
1578 except UnavailableVideoError:
1579 self._downloader.trouble(u'\nERROR: unable to download video')
1582 class DailymotionIE(InfoExtractor):
1583 """Information Extractor for Dailymotion"""
# Accepts any dailymotion TLD; group(1) is the id, group(2) the slug.
1585 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1586 IE_NAME = u'dailymotion'
1588 def __init__(self, downloader=None):
# Plain pass-through to the base constructor.
1589 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    # All progress output is funnelled through the shared downloader.
    self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1595 def report_extraction(self, video_id):
1596 """Report information extraction."""
1597 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1599 def _real_extract(self, url):
# Downloads the page (family filter disabled via cookie), pulls the
# 'sequence' flashvar and takes the sdURL stream from it.
1600 # Extract id and simplified title from URL
1601 mobj = re.match(self._VALID_URL, url)
1603 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1606 # At this point we have a new video
1607 self._downloader.increment_downloads()
1608 video_id = mobj.group(1)
1610 simple_title = mobj.group(2).decode('utf-8')
1611 video_extension = 'flv'
1613 # Retrieve video webpage to extract further information
1614 request = urllib2.Request(url)
# Disable the family filter so restricted videos are reachable.
1615 request.add_header('Cookie', 'family_filter=off')
1617 self.report_download_webpage(video_id)
1618 webpage = urllib2.urlopen(request).read()
1619 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1623 # Extract URL, uploader and title from webpage
1624 self.report_extraction(video_id)
1625 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1627 self._downloader.trouble(u'ERROR: unable to extract media URL')
1629 sequence = urllib.unquote(mobj.group(1))
1630 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1632 self._downloader.trouble(u'ERROR: unable to extract media URL')
1634 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1636 # if needed add http://www.dailymotion.com/ if relative URL
1638 video_url = mediaURL
1640 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract title')
1644 video_title = mobj.group(1).decode('utf-8')
1645 video_title = sanitize_title(video_title)
1647 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1649 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1651 video_uploader = mobj.group(1)
1654 # Process video information
1655 self._downloader.process_info({
1656 'id': video_id.decode('utf-8'),
1657 'url': video_url.decode('utf-8'),
1658 'uploader': video_uploader.decode('utf-8'),
1659 'upload_date': u'NA',
1660 'title': video_title,
1661 'stitle': simple_title,
1662 'ext': video_extension.decode('utf-8'),
1666 except UnavailableVideoError:
1667 self._downloader.trouble(u'\nERROR: unable to download video')
1670 class GoogleIE(InfoExtractor):
1671 """Information extractor for video.google.com."""
# Matches the national Google Video domains; group(1) is the docid.
1673 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1674 IE_NAME = u'video.google'
1676 def __init__(self, downloader=None):
# Plain pass-through to the base constructor.
1677 InfoExtractor.__init__(self, downloader)
1679 def report_download_webpage(self, video_id):
1680 """Report webpage download."""
1681 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce that information extraction has started for video_id."""
    # Progress output is funnelled through the shared downloader.
    self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1687 def _real_extract(self, url):
# Prefers the mp4 download_url; falls back to the escaped flv videoUrl.
1688 # Extract id from URL
1689 mobj = re.match(self._VALID_URL, url)
1691 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1694 # At this point we have a new video
1695 self._downloader.increment_downloads()
1696 video_id = mobj.group(1)
1698 video_extension = 'mp4'
1700 # Retrieve video webpage to extract further information
1701 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1703 self.report_download_webpage(video_id)
1704 webpage = urllib2.urlopen(request).read()
1705 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1706 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1709 # Extract URL, uploader, and title from webpage
1710 self.report_extraction(video_id)
1711 mobj = re.search(r"download_url:'([^']+)'", webpage)
# No direct mp4 link: fall back to the escaped flv stream URL.
1713 video_extension = 'flv'
1714 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1716 self._downloader.trouble(u'ERROR: unable to extract media URL')
1718 mediaURL = urllib.unquote(mobj.group(1))
# Undo the \xNN escaping used inside the page's JavaScript.
1719 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1720 mediaURL = mediaURL.replace('\\x26', '\x26')
1722 video_url = mediaURL
1724 mobj = re.search(r'<title>(.*)</title>', webpage)
1726 self._downloader.trouble(u'ERROR: unable to extract title')
1728 video_title = mobj.group(1).decode('utf-8')
1729 video_title = sanitize_title(video_title)
1730 simple_title = _simplify_title(video_title)
1732 # Extract video description
1733 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1735 self._downloader.trouble(u'ERROR: unable to extract video description')
1737 video_description = mobj.group(1).decode('utf-8')
1738 if not video_description:
1739 video_description = 'No description available.'
1741 # Extract video thumbnail
# Thumbnail requires a second request, so it is only fetched on demand.
1742 if self._downloader.params.get('forcethumbnail', False):
1743 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1745 webpage = urllib2.urlopen(request).read()
1746 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1747 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1749 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1751 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1753 video_thumbnail = mobj.group(1)
1754 else: # we need something to pass to process_info
1755 video_thumbnail = ''
1758 # Process video information
1759 self._downloader.process_info({
1760 'id': video_id.decode('utf-8'),
1761 'url': video_url.decode('utf-8'),
1763 'upload_date': u'NA',
1764 'title': video_title,
1765 'stitle': simple_title,
1766 'ext': video_extension.decode('utf-8'),
1770 except UnavailableVideoError:
1771 self._downloader.trouble(u'\nERROR: unable to download video')
1774 class PhotobucketIE(InfoExtractor):
1775 """Information extractor for photobucket.com."""
# Only .flv media linked through the 'current' query parameter is handled.
1777 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1778 IE_NAME = u'photobucket'
1780 def __init__(self, downloader=None):
# Plain pass-through to the base constructor.
1781 InfoExtractor.__init__(self, downloader)
1783 def report_download_webpage(self, video_id):
1784 """Report webpage download."""
1785 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1787 def report_extraction(self, video_id):
1788 """Report information extraction."""
1789 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1791 def _real_extract(self, url):
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1802 video_extension = 'flv'
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request(url)
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
1815 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1817 self._downloader.trouble(u'ERROR: unable to extract media URL')
1819 mediaURL = urllib.unquote(mobj.group(1))
1821 video_url = mediaURL
1823 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1825 self._downloader.trouble(u'ERROR: unable to extract title')
1827 video_title = mobj.group(1).decode('utf-8')
1828 video_title = sanitize_title(video_title)
1829 simple_title = _simplify_title(vide_title)
1831 video_uploader = mobj.group(2).decode('utf-8')
1834 # Process video information
1835 self._downloader.process_info({
1836 'id': video_id.decode('utf-8'),
1837 'url': video_url.decode('utf-8'),
1838 'uploader': video_uploader,
1839 'upload_date': u'NA',
1840 'title': video_title,
1841 'stitle': simple_title,
1842 'ext': video_extension.decode('utf-8'),
1846 except UnavailableVideoError:
1847 self._downloader.trouble(u'\nERROR: unable to download video')
1850 class YahooIE(InfoExtractor):
1851 """Information extractor for video.yahoo.com."""
1853 # _VALID_URL matches all Yahoo! Video URLs
1854 # _VPAGE_URL matches only the extractable '/watch/' URLs
1855 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1856 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1857 IE_NAME = u'video.yahoo'
1859 def __init__(self, downloader=None):
# Plain pass-through to the base constructor.
1860 InfoExtractor.__init__(self, downloader)
1862 def report_download_webpage(self, video_id):
1863 """Report webpage download."""
1864 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1866 def report_extraction(self, video_id):
1867 """Report information extraction."""
1868 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1870 def _real_extract(self, url, new_video=True):
# Rewrites non-/watch/ URLs into extractable ones (single recursion via
# new_video=False), then scrapes metadata and asks the playlist service
# for the real stream URL.
1871 # Extract ID from URL
1872 mobj = re.match(self._VALID_URL, url)
1874 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1877 # At this point we have a new video
1878 self._downloader.increment_downloads()
1879 video_id = mobj.group(2)
1880 video_extension = 'flv'
1882 # Rewrite valid but non-extractable URLs as
1883 # extractable English language /watch/ URLs
1884 if re.match(self._VPAGE_URL, url) is None:
1885 request = urllib2.Request(url)
1887 webpage = urllib2.urlopen(request).read()
1888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1892 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1894 self._downloader.trouble(u'ERROR: Unable to extract id field')
1896 yahoo_id = mobj.group(1)
1898 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1900 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1902 yahoo_vid = mobj.group(1)
1904 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1905 return self._real_extract(url, new_video=False)
1907 # Retrieve video webpage to extract further information
1908 request = urllib2.Request(url)
1910 self.report_download_webpage(video_id)
1911 webpage = urllib2.urlopen(request).read()
1912 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916 # Extract uploader and title from webpage
1917 self.report_extraction(video_id)
1918 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1920 self._downloader.trouble(u'ERROR: unable to extract video title')
1922 video_title = mobj.group(1).decode('utf-8')
1923 simple_title = _simplify_title(video_title)
1925 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1927 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the 'people'/'profile'
# alternation; the uploader name is in group(2). This line likely stores
# the wrong group -- confirm and fix separately.
1929 video_uploader = mobj.group(1).decode('utf-8')
1931 # Extract video thumbnail
1932 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1934 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1936 video_thumbnail = mobj.group(1).decode('utf-8')
1938 # Extract video description
1939 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1941 self._downloader.trouble(u'ERROR: unable to extract video description')
1943 video_description = mobj.group(1).decode('utf-8')
1944 if not video_description:
1945 video_description = 'No description available.'
1947 # Extract video height and width
1948 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1950 self._downloader.trouble(u'ERROR: unable to extract video height')
1952 yv_video_height = mobj.group(1)
1954 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1956 self._downloader.trouble(u'ERROR: unable to extract video width')
1958 yv_video_width = mobj.group(1)
1960 # Retrieve video playlist to extract media URL
1961 # I'm not completely sure what all these options are, but we
1962 # seem to need most of them, otherwise the server sends a 401.
1963 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1964 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1965 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1966 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1967 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1969 self.report_download_webpage(video_id)
1970 webpage = urllib2.urlopen(request).read()
1971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1975 # Extract media URL from playlist XML
1976 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1978 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1980 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1981 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1984 # Process video information
1985 self._downloader.process_info({
1986 'id': video_id.decode('utf-8'),
1988 'uploader': video_uploader,
1989 'upload_date': u'NA',
1990 'title': video_title,
1991 'stitle': simple_title,
1992 'ext': video_extension.decode('utf-8'),
1993 'thumbnail': video_thumbnail.decode('utf-8'),
1994 'description': video_description,
# NOTE(review): 'thumbnail' appears twice in this dict literal; the
# later entry silently wins.
1995 'thumbnail': video_thumbnail,
1998 except UnavailableVideoError:
1999 self._downloader.trouble(u'\nERROR: unable to download video')
2002 class VimeoIE(InfoExtractor):
2003 """Information extractor for vimeo.com."""
2005 # _VALID_URL matches Vimeo URLs
# group(1) is the numeric clip id; groups/player URLs are accepted too.
2006 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2009 def __init__(self, downloader=None):
# Plain pass-through to the base constructor.
2010 InfoExtractor.__init__(self, downloader)
2012 def report_download_webpage(self, video_id):
2013 """Report webpage download."""
2014 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2016 def report_extraction(self, video_id):
2017 """Report information extraction."""
2018 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2020 def _real_extract(self, url, new_video=True):
# Fetches the moogaloop clip XML and builds a signed play URL from the
# request signature, its expiry and the quality flag.
2021 # Extract ID from URL
2022 mobj = re.match(self._VALID_URL, url)
2024 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2027 # At this point we have a new video
2028 self._downloader.increment_downloads()
2029 video_id = mobj.group(1)
2031 # Retrieve video webpage to extract further information
2032 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2034 self.report_download_webpage(video_id)
2035 webpage = urllib2.urlopen(request).read()
2036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2037 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2040 # Now we begin extracting as much information as we can from what we
2041 # retrieved. First we extract the information common to all extractors,
2042 # and latter we extract those that are Vimeo specific.
2043 self.report_extraction(video_id)
2046 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2048 self._downloader.trouble(u'ERROR: unable to extract video title')
2050 video_title = mobj.group(1).decode('utf-8')
2051 simple_title = _simplify_title(video_title)
2054 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2056 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2058 video_uploader = mobj.group(1).decode('utf-8')
2060 # Extract video thumbnail
2061 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2063 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2065 video_thumbnail = mobj.group(1).decode('utf-8')
2067 # # Extract video description
2068 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2070 # self._downloader.trouble(u'ERROR: unable to extract video description')
2072 # video_description = mobj.group(1).decode('utf-8')
2073 # if not video_description: video_description = 'No description available.'
# NOTE(review): placeholder value left in while the real description
# extraction above is commented out.
2074 video_description = 'Foo.'
2076 # Vimeo specific: extract request signature
2077 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2079 self._downloader.trouble(u'ERROR: unable to extract request signature')
2081 sig = mobj.group(1).decode('utf-8')
2083 # Vimeo specific: extract video quality information
2084 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2086 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2088 quality = mobj.group(1).decode('utf-8')
2090 if int(quality) == 1:
2095 # Vimeo specific: Extract request signature expiration
2096 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2098 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2100 sig_exp = mobj.group(1).decode('utf-8')
2102 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2105 # Process video information
2106 self._downloader.process_info({
2107 'id': video_id.decode('utf-8'),
2109 'uploader': video_uploader,
2110 'upload_date': u'NA',
2111 'title': video_title,
2112 'stitle': simple_title,
2114 'thumbnail': video_thumbnail.decode('utf-8'),
2115 'description': video_description,
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries silently win.
2116 'thumbnail': video_thumbnail,
2117 'description': video_description,
2120 except UnavailableVideoError:
2121 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: scrapes any webpage for a JW-Player/SWFObject-style
# direct media URL. NOTE(review): view is elided (leading numbers are the
# original file's line numbers); missing lines are mostly `if mobj is None:`
# guards, `try:` headers and `return` statements.
2124 class GenericIE(InfoExtractor):
2125 """Generic last-resort information extractor."""
2128 IE_NAME = u'generic'
2130 def __init__(self, downloader=None):
2131 InfoExtractor.__init__(self, downloader)
2133 def report_download_webpage(self, video_id):
2134 """Report webpage download (and warn that the generic fallback is in use)."""
2135 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2136 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2138 def report_extraction(self, video_id):
2139 """Report information extraction."""
2140 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2142 def _real_extract(self, url):
2143 # At this point we have a new video
2144 self._downloader.increment_downloads()
# Provisional id: last path component of the URL (refined below once the
# real media URL is known).
2146 video_id = url.split('/')[-1]
2147 request = urllib2.Request(url)
2149 self.report_download_webpage(video_id)
2150 webpage = urllib2.urlopen(request).read()
2151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2152 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2154 except ValueError, err:
2155 # since this is the last-resort InfoExtractor, if
2156 # this error is thrown, it'll be thrown here
2157 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2160 self.report_extraction(video_id)
2161 # Start with something easy: JW Player in SWFObject
2162 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2164 # Broaden the search a little bit
2165 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2167 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2170 # It's possible that one of the regexes
2171 # matched, but returned an empty group:
2172 if mobj.group(1) is None:
2173 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2176 video_url = urllib.unquote(mobj.group(1))
2177 video_id = os.path.basename(video_url)
2179 # here's a fun little line of code for you:
2180 video_extension = os.path.splitext(video_id)[1][1:]
2181 video_id = os.path.splitext(video_id)[0]
2183 # it's tempting to parse this further, but you would
2184 # have to take into account all the variations like
2185 # Video Title - Site Name
2186 # Site Name | Video Title
2187 # Video Title - Tagline | Site Name
2188 # and so on and so forth; it's just not practical
2189 mobj = re.search(r'<title>(.*)</title>', webpage)
2191 self._downloader.trouble(u'ERROR: unable to extract title')
2193 video_title = mobj.group(1).decode('utf-8')
2194 video_title = sanitize_title(video_title)
2195 simple_title = _simplify_title(video_title)
2197 # video uploader is domain name
2198 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): wrong error message — this branch failed to extract the
# UPLOADER (domain name), not the title; should read
# u'ERROR: unable to extract video uploader'.
2200 self._downloader.trouble(u'ERROR: unable to extract title')
2202 video_uploader = mobj.group(1).decode('utf-8')
2205 # Process video information
2206 self._downloader.process_info({
2207 'id': video_id.decode('utf-8'),
2208 'url': video_url.decode('utf-8'),
2209 'uploader': video_uploader,
2210 'upload_date': u'NA',
2211 'title': video_title,
2212 'stitle': simple_title,
2213 'ext': video_extension.decode('utf-8'),
2217 except UnavailableVideoError, err:
2218 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles 'ytsearch[N|all]:QUERY' pseudo-URLs: scrapes YouTube result pages
# for video ids and hands each off to the wrapped YoutubeIE. Structurally a
# near-duplicate of GoogleSearchIE and YahooSearchIE below.
# NOTE(review): view is elided (leading numbers are original file line
# numbers); missing lines are guards, `try:` headers, returns and the
# while-loop / counter initialisation in _download_n_results.
2221 class YoutubeSearchIE(InfoExtractor):
2222 """Information Extractor for YouTube search queries."""
2223 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2224 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2225 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2226 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2228 _max_youtube_results = 1000
2229 IE_NAME = u'youtube:search'
2231 def __init__(self, youtube_ie, downloader=None):
2232 InfoExtractor.__init__(self, downloader)
2233 self._youtube_ie = youtube_ie
2235 def report_download_page(self, query, pagenum):
2236 """Report attempt to download search results page with given number."""
2237 query = query.decode(preferredencoding())
2238 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2240 def _real_initialize(self):
2241 self._youtube_ie.initialize()
2243 def _real_extract(self, query):
2244 mobj = re.match(self._VALID_URL, query)
2246 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split off the 'ytsearchN' prefix; empty prefix means one result, 'all'
# means the maximum, otherwise the prefix is parsed as a number (elided).
2249 prefix, query = query.split(':')
2251 query = query.encode('utf-8')
2253 self._download_n_results(query, 1)
2255 elif prefix == 'all':
2256 self._download_n_results(query, self._max_youtube_results)
2262 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2264 elif n > self._max_youtube_results:
2265 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2266 n = self._max_youtube_results
2267 self._download_n_results(query, n)
2269 except ValueError: # parsing prefix as integer fails
2270 self._download_n_results(query, 1)
2273 def _download_n_results(self, query, n):
2274 """Downloads a specified number of results for a query"""
2277 already_seen = set()
2281 self.report_download_page(query, pagenum)
2282 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2283 request = urllib2.Request(result_url)
2285 page = urllib2.urlopen(request).read()
2286 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2287 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2290 # Extract video identifiers
2291 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and strip the trailing quote to get the video id.
2292 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2293 if video_id not in already_seen:
2294 video_ids.append(video_id)
2295 already_seen.add(video_id)
2296 if len(video_ids) == n:
2297 # Specified n videos reached
2298 for id in video_ids:
2299 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: final page reached — flush whatever was collected.
2302 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2303 for id in video_ids:
2304 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2307 pagenum = pagenum + 1
# Handles 'gvsearch[N|all]:QUERY' pseudo-URLs for Google Video; mirrors
# YoutubeSearchIE almost line for line (same elision caveat applies).
2310 class GoogleSearchIE(InfoExtractor):
2311 """Information Extractor for Google Video search queries."""
2312 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2313 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2314 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2315 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2317 _max_google_results = 1000
2318 IE_NAME = u'video.google:search'
2320 def __init__(self, google_ie, downloader=None):
2321 InfoExtractor.__init__(self, downloader)
2322 self._google_ie = google_ie
2324 def report_download_page(self, query, pagenum):
2325 """Report attempt to download search results page with given number."""
2326 query = query.decode(preferredencoding())
2327 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2329 def _real_initialize(self):
2330 self._google_ie.initialize()
2332 def _real_extract(self, query):
2333 mobj = re.match(self._VALID_URL, query)
2335 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2338 prefix, query = query.split(':')
2340 query = query.encode('utf-8')
2342 self._download_n_results(query, 1)
2344 elif prefix == 'all':
2345 self._download_n_results(query, self._max_google_results)
2351 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2353 elif n > self._max_google_results:
2354 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2355 n = self._max_google_results
2356 self._download_n_results(query, n)
2358 except ValueError: # parsing prefix as integer fails
2359 self._download_n_results(query, 1)
2362 def _download_n_results(self, query, n):
2363 """Downloads a specified number of results for a query"""
2366 already_seen = set()
2370 self.report_download_page(query, pagenum)
# NOTE(review): the template's second placeholder is 'start=%s' (a result
# OFFSET), but pagenum looks like a page counter — presumably pages overlap
# or the loop steps by page; verify against the elided loop header.
2371 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2372 request = urllib2.Request(result_url)
2374 page = urllib2.urlopen(request).read()
2375 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2376 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2379 # Extract video identifiers
2380 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2381 video_id = mobj.group(1)
2382 if video_id not in already_seen:
2383 video_ids.append(video_id)
2384 already_seen.add(video_id)
2385 if len(video_ids) == n:
2386 # Specified n videos reached
2387 for id in video_ids:
2388 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2391 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2392 for id in video_ids:
2393 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2396 pagenum = pagenum + 1
# Handles 'yvsearch[N|all]:QUERY' pseudo-URLs for Yahoo! Video; third copy of
# the search-IE pattern (same elision caveat as YoutubeSearchIE).
2399 class YahooSearchIE(InfoExtractor):
2400 """Information Extractor for Yahoo! Video search queries."""
2401 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2402 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2403 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2404 _MORE_PAGES_INDICATOR = r'\s*Next'
2406 _max_yahoo_results = 1000
2407 IE_NAME = u'video.yahoo:search'
2409 def __init__(self, yahoo_ie, downloader=None):
2410 InfoExtractor.__init__(self, downloader)
2411 self._yahoo_ie = yahoo_ie
2413 def report_download_page(self, query, pagenum):
2414 """Report attempt to download search results page with given number."""
2415 query = query.decode(preferredencoding())
2416 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2418 def _real_initialize(self):
2419 self._yahoo_ie.initialize()
2421 def _real_extract(self, query):
2422 mobj = re.match(self._VALID_URL, query)
2424 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2427 prefix, query = query.split(':')
2429 query = query.encode('utf-8')
2431 self._download_n_results(query, 1)
2433 elif prefix == 'all':
2434 self._download_n_results(query, self._max_yahoo_results)
2440 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2442 elif n > self._max_yahoo_results:
2443 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2444 n = self._max_yahoo_results
2445 self._download_n_results(query, n)
2447 except ValueError: # parsing prefix as integer fails
2448 self._download_n_results(query, 1)
2451 def _download_n_results(self, query, n):
2452 """Downloads a specified number of results for a query"""
2455 already_seen = set()
2459 self.report_download_page(query, pagenum)
# 'o' is Yahoo's result-offset parameter — presumably pagenum here is an
# offset rather than a page index; verify against the elided loop header.
2460 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2461 request = urllib2.Request(result_url)
2463 page = urllib2.urlopen(request).read()
2464 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2465 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2468 # Extract video identifiers
2469 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2470 video_id = mobj.group(1)
2471 if video_id not in already_seen:
2472 video_ids.append(video_id)
2473 already_seen.add(video_id)
2474 if len(video_ids) == n:
2475 # Specified n videos reached
2476 for id in video_ids:
2477 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2480 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2481 for id in video_ids:
2482 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2485 pagenum = pagenum + 1
2488 class YoutubePlaylistIE(InfoExtractor):
2489 """Information Extractor for YouTube playlists."""
2491 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2492 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2493 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2494 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2496 IE_NAME = u'youtube:playlist'
2498 def __init__(self, youtube_ie, downloader=None):
2499 InfoExtractor.__init__(self, downloader)
2500 self._youtube_ie = youtube_ie
2502 def report_download_page(self, playlist_id, pagenum):
2503 """Report attempt to download playlist page with given number."""
2504 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2506 def _real_initialize(self):
2507 self._youtube_ie.initialize()
2509 def _real_extract(self, url):
2510 # Extract playlist id
2511 mobj = re.match(self._VALID_URL, url)
2513 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2517 if mobj.group(3) is not None:
2518 self._youtube_ie.extract(mobj.group(3))
2521 # Download playlist pages
2522 # prefix is 'p' as default for playlists but there are other types that need extra care
2523 playlist_prefix = mobj.group(1)
2524 if playlist_prefix == 'a':
2525 playlist_access = 'artist'
2527 playlist_prefix = 'p'
2528 playlist_access = 'view_play_list'
2529 playlist_id = mobj.group(2)
2534 self.report_download_page(playlist_id, pagenum)
2535 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2536 request = urllib2.Request(url)
2538 page = urllib2.urlopen(request).read()
2539 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2540 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2543 # Extract video identifiers
2545 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2546 if mobj.group(1) not in ids_in_page:
2547 ids_in_page.append(mobj.group(1))
2548 video_ids.extend(ids_in_page)
2550 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2552 pagenum = pagenum + 1
2554 playliststart = self._downloader.params.get('playliststart', 1) - 1
2555 playlistend = self._downloader.params.get('playlistend', -1)
2556 video_ids = video_ids[playliststart:playlistend]
2558 for id in video_ids:
2559 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Extracts all uploads of a YouTube user via the GData API, paging through
# the uploads feed 50 ids at a time. NOTE(review): view is elided (leading
# numbers are original file line numbers); missing lines are guards, the
# paging loop header, `try:`s, returns and list initialisations.
2563 class YoutubeUserIE(InfoExtractor):
2564 """Information Extractor for YouTube users."""
2566 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2567 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2568 _GDATA_PAGE_SIZE = 50
2569 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2570 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2572 IE_NAME = u'youtube:user'
2574 def __init__(self, youtube_ie, downloader=None):
2575 InfoExtractor.__init__(self, downloader)
2576 self._youtube_ie = youtube_ie
2578 def report_download_page(self, username, start_index):
2579 """Report attempt to download user page."""
2580 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2581 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2583 def _real_initialize(self):
2584 self._youtube_ie.initialize()
2586 def _real_extract(self, url):
2588 mobj = re.match(self._VALID_URL, url)
2590 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2593 username = mobj.group(1)
2595 # Download video ids using YouTube Data API. Result size per
2596 # query is limited (currently to 50 videos) so we need to query
2597 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2604 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2605 self.report_download_page(username, start_index)
2607 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2610 page = urllib2.urlopen(request).read()
2611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2615 # Extract video identifiers
2618 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2619 if mobj.group(1) not in ids_in_page:
2620 ids_in_page.append(mobj.group(1))
2622 video_ids.extend(ids_in_page)
2624 # A little optimization - if current page is not
2625 # "full", ie. does not contain PAGE_SIZE video ids then
2626 # we can assume that this page is the last one - there
2627 # are no more ids on further pages - no need to query
2630 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2635 all_ids_count = len(video_ids)
2636 playliststart = self._downloader.params.get('playliststart', 1) - 1
2637 playlistend = self._downloader.params.get('playlistend', -1)
# -1 is the "no limit" sentinel; slicing with it directly would drop the
# last id, so it is special-cased here (YoutubePlaylistIE lacks this guard).
2639 if playlistend == -1:
2640 video_ids = video_ids[playliststart:]
2642 video_ids = video_ids[playliststart:playlistend]
2644 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2645 (username, all_ids_count, len(video_ids)))
2647 for video_id in video_ids:
2648 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extracts the direct fileshare URL from a depositfiles.com page by POSTing
# the "Free download" form. NOTE(review): view is elided (leading numbers are
# original file line numbers); missing lines are `try:` headers, guards and
# `return` statements.
2651 class DepositFilesIE(InfoExtractor):
2652 """Information extractor for depositfiles.com"""
2654 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2655 IE_NAME = u'DepositFiles'
2657 def __init__(self, downloader=None):
2658 InfoExtractor.__init__(self, downloader)
2660 def report_download_webpage(self, file_id):
2661 """Report webpage download."""
2662 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2664 def report_extraction(self, file_id):
2665 """Report information extraction."""
2666 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2668 def _real_extract(self, url):
2669 # At this point we have a new file
2670 self._downloader.increment_downloads()
2672 file_id = url.split('/')[-1]
2673 # Rebuild url in english locale
2674 url = 'http://depositfiles.com/en/files/' + file_id
2676 # Retrieve file webpage with 'Free download' button pressed
# Supplying POST data makes urllib2 issue a POST, simulating the button.
2677 free_download_indication = { 'gateway_result' : '1' }
2678 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2680 self.report_download_webpage(file_id)
2681 webpage = urllib2.urlopen(request).read()
2682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2683 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2686 # Search for the real file URL
2687 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2688 if (mobj is None) or (mobj.group(1) is None):
2689 # Try to figure out reason of the error.
2690 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2691 if (mobj is not None) and (mobj.group(1) is not None):
2692 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2693 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2695 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2698 file_url = mobj.group(1)
2699 file_extension = os.path.splitext(file_url)[1][1:]
2701 # Search for file title
2702 mobj = re.search(r'<b title="(.*?)">', webpage)
2704 self._downloader.trouble(u'ERROR: unable to extract title')
2706 file_title = mobj.group(1).decode('utf-8')
2709 # Process file information
2710 self._downloader.process_info({
2711 'id': file_id.decode('utf-8'),
2712 'url': file_url.decode('utf-8'),
2714 'upload_date': u'NA',
2715 'title': file_title,
2716 'stitle': file_title,
2717 'ext': file_extension.decode('utf-8'),
2721 except UnavailableVideoError, err:
2722 self._downloader.trouble(u'ERROR: unable to download file')
# Facebook video extractor: logs in (credentials or .netrc), scrapes the
# video page's JavaScript for metadata and per-quality URLs, then applies
# format selection. NOTE(review): view is elided (leading numbers are
# original file line numbers); missing lines are guards, `try:` headers,
# `return`s and dict/loop initialisations.
2725 class FacebookIE(InfoExtractor):
2726 """Information Extractor for Facebook"""
2728 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2729 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2730 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2731 _available_formats = ['video', 'highqual', 'lowqual']
2732 _video_extensions = {
2737 IE_NAME = u'facebook'
2739 def __init__(self, downloader=None):
2740 InfoExtractor.__init__(self, downloader)
2742 def _reporter(self, message):
2743 """Add header and report message."""
2744 self._downloader.to_screen(u'[facebook] %s' % message)
2746 def report_login(self):
2747 """Report attempt to log in."""
2748 self._reporter(u'Logging in')
2750 def report_video_webpage_download(self, video_id):
2751 """Report attempt to download video webpage."""
2752 self._reporter(u'%s: Downloading video webpage' % video_id)
2754 def report_information_extraction(self, video_id):
2755 """Report attempt to extract video information."""
2756 self._reporter(u'%s: Extracting video information' % video_id)
2758 def _parse_page(self, video_webpage):
2759 """Extract video information from page"""
# Each metadata field is located by its own regex in the page's JS.
2761 data = {'title': r'\("video_title", "(.*?)"\)',
2762 'description': r'<div class="datawrap">(.*?)</div>',
2763 'owner': r'\("video_owner_name", "(.*?)"\)',
2764 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2767 for piece in data.keys():
2768 mobj = re.search(data[piece], video_webpage)
2769 if mobj is not None:
2770 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2774 for fmt in self._available_formats:
2775 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2776 if mobj is not None:
2777 # URL is in a Javascript segment inside an escaped Unicode format within
2778 # the generally utf-8 page
2779 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2780 video_info['video_urls'] = video_urls
2784 def _real_initialize(self):
2785 if self._downloader is None:
2790 downloader_params = self._downloader.params
2792 # Attempt to use provided username and password or .netrc data
2793 if downloader_params.get('username', None) is not None:
2794 useremail = downloader_params['username']
2795 password = downloader_params['password']
2796 elif downloader_params.get('usenetrc', False):
2798 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2799 if info is not None:
2803 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2804 except (IOError, netrc.NetrcParseError), err:
2805 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2808 if useremail is None:
2817 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2820 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2821 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): 'exceded' is a typo ('exceeded') in this user-facing string;
# cannot be changed in a doc-only pass.
2822 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2824 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2825 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2828 def _real_extract(self, url):
2829 mobj = re.match(self._VALID_URL, url)
2831 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2833 video_id = mobj.group('ID')
2836 self.report_video_webpage_download(video_id)
2837 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2839 page = urllib2.urlopen(request)
2840 video_webpage = page.read()
2841 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2842 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2845 # Start extracting information
2846 self.report_information_extraction(video_id)
2848 # Extract information
2849 video_info = self._parse_page(video_webpage)
2852 if 'owner' not in video_info:
2853 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2855 video_uploader = video_info['owner']
2858 if 'title' not in video_info:
2859 self._downloader.trouble(u'ERROR: unable to extract video title')
2861 video_title = video_info['title']
2862 video_title = video_title.decode('utf-8')
2863 video_title = sanitize_title(video_title)
2865 simple_title = _simplify_title(video_title)
2868 if 'thumbnail' not in video_info:
# Thumbnail is optional: warn and continue with an empty value.
2869 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2870 video_thumbnail = ''
2872 video_thumbnail = video_info['thumbnail']
2876 if 'upload_date' in video_info:
2877 upload_time = video_info['upload_date']
2878 timetuple = email.utils.parsedate_tz(upload_time)
2879 if timetuple is not None:
2881 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2886 video_description = video_info.get('description', 'No description available.')
2888 url_map = video_info['video_urls']
# NOTE(review): `len(url_map.keys()) > 0` would read better as `if url_map:`.
2889 if len(url_map.keys()) > 0:
2890 # Decide which formats to download
2891 req_format = self._downloader.params.get('format', None)
2892 format_limit = self._downloader.params.get('format_limit', None)
2894 if format_limit is not None and format_limit in self._available_formats:
2895 format_list = self._available_formats[self._available_formats.index(format_limit):]
2897 format_list = self._available_formats
2898 existing_formats = [x for x in format_list if x in url_map]
2899 if len(existing_formats) == 0:
2900 self._downloader.trouble(u'ERROR: no known formats available for video')
2902 if req_format is None:
2903 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2904 elif req_format == 'worst':
# NOTE(review): `existing_formats[-1]` would be the idiomatic spelling.
2905 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2906 elif req_format == '-1':
2907 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2910 if req_format not in url_map:
2911 self._downloader.trouble(u'ERROR: requested format not available')
2913 video_url_list = [(req_format, url_map[req_format])] # Specific format
2915 for format_param, video_real_url in video_url_list:
2917 # At this point we have a new video
2918 self._downloader.increment_downloads()
2921 video_extension = self._video_extensions.get(format_param, 'mp4')
2924 # Process video information
2925 self._downloader.process_info({
2926 'id': video_id.decode('utf-8'),
2927 'url': video_real_url.decode('utf-8'),
2928 'uploader': video_uploader.decode('utf-8'),
2929 'upload_date': upload_date,
2930 'title': video_title,
2931 'stitle': simple_title,
2932 'ext': video_extension.decode('utf-8'),
# NOTE(review): the `and/or` ternary idiom breaks if the middle operand is
# falsy; format_param == u'' would silently yield u'NA'.
2933 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2934 'thumbnail': video_thumbnail.decode('utf-8'),
2935 'description': video_description.decode('utf-8'),
2938 except UnavailableVideoError, err:
2939 self._downloader.trouble(u'\nERROR: unable to download video')
# blip.tv extractor: requests the URL with a JSON skin; a 'video/*' response
# means the URL is already a direct download, otherwise the JSON payload is
# parsed for metadata. NOTE(review): view is elided (leading numbers are
# original file line numbers); missing lines are guards, `try:` headers,
# returns and parts of the info dicts.
2941 class BlipTVIE(InfoExtractor):
2942 """Information extractor for blip.tv"""
2944 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2945 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2946 IE_NAME = u'blip.tv'
2948 def report_extraction(self, file_id):
2949 """Report information extraction."""
2950 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2952 def report_direct_download(self, title):
2953 """Report that a direct video download was detected."""
2954 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2956 def _real_extract(self, url):
2957 mobj = re.match(self._VALID_URL, url)
2959 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&', chosen in an elided branch) joins the JSON-skin query
# onto the original URL.
2966 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2967 request = urllib2.Request(json_url)
2968 self.report_extraction(mobj.group(1))
2971 urlh = urllib2.urlopen(request)
2972 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2973 basename = url.split('/')[-1]
2974 title,ext = os.path.splitext(basename)
2975 title = title.decode('UTF-8')
2976 ext = ext.replace('.', '')
2977 self.report_direct_download(title)
2982 'stitle': _simplify_title(title),
2986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2989 if info is None: # Regular URL
2991 json_code = urlh.read()
2992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2993 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2997 json_data = json.loads(json_code)
2998 if 'Post' in json_data:
2999 data = json_data['Post']
# blip.tv datestamps look like '10-31-11 05:30PM'; normalized to YYYYMMDD.
3003 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3004 video_url = data['media']['url']
3005 umobj = re.match(self._URL_EXT, video_url)
# Raised here so the shared ValueError/KeyError handler below reports it.
3007 raise ValueError('Can not determine filename extension')
3008 ext = umobj.group(1)
3011 'id': data['item_id'],
3013 'uploader': data['display_name'],
3014 'upload_date': upload_date,
3015 'title': data['title'],
3016 'stitle': _simplify_title(data['title']),
3018 'format': data['media']['mimeType'],
3019 'thumbnail': data['thumbnailUrl'],
3020 'description': data['description'],
3021 'player_url': data['embedUrl']
3023 except (ValueError,KeyError), err:
3024 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3027 self._downloader.increment_downloads()
3030 self._downloader.process_info(info)
3031 except UnavailableVideoError, err:
3032 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for myvideo.de watch pages: builds the direct .flv URL from the
# thumbnail <link> element and takes the video title from the page <title>.
# NOTE(review): this listing is sampled — line numbers jump (3036->3038 etc.),
# so `if mobj is None:` guards, `return`s and `try:` lines between the
# statements below are elided from view.
3035 class MyVideoIE(InfoExtractor):
3036 """Information Extractor for myvideo.de."""
# Group 1: numeric video id; group 2: URL slug.
3038 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3039 IE_NAME = u'myvideo'
3041 def __init__(self, downloader=None):
3042 InfoExtractor.__init__(self, downloader)
3044 def report_download_webpage(self, video_id):
3045 """Report webpage download."""
3046 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3048 def report_extraction(self, video_id):
3049 """Report information extraction."""
3050 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3052 def _real_extract(self,url):
3053 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `self._download` looks like a typo for `self._downloader`
# (every other call in this class uses `self._downloader`) — would raise
# AttributeError when an invalid URL is reported. Confirm against full file.
3055 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3058 video_id = mobj.group(1)
3061 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3063 self.report_download_webpage(video_id)
3064 webpage = urllib2.urlopen(request).read()
3065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3066 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3069 self.report_extraction(video_id)
# The media base URL is scraped from the image_src thumbnail link; the flv
# lives under the same movie directory.
3070 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3073 self._downloader.trouble(u'ERROR: unable to extract media URL')
3075 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3077 mobj = re.search('<title>([^<]+)</title>', webpage)
3079 self._downloader.trouble(u'ERROR: unable to extract title')
3082 video_title = mobj.group(1)
3083 video_title = sanitize_title(video_title)
3085 simple_title = _simplify_title(video_title)
# Hand the info dict to the downloader; entries 3089-3091 and 3095-3098
# (id/url/ext/format, closing brace) are elided from this listing.
3088 self._downloader.process_info({
3092 'upload_date': u'NA',
3093 'title': video_title,
3094 'stitle': simple_title,
3099 except UnavailableVideoError:
3100 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extractor for The Daily Show / Colbert Report full episodes. Accepts either
# a shortname (":tds", ":colbert", ...) or a full-episodes URL, follows the
# site redirect to a concrete episode, then walks the MRSS index feed and a
# per-media config XML to find rendition URLs.
# NOTE(review): listing is sampled — guards, returns and loop bodies between
# the visible statements are elided.
3102 class ComedyCentralIE(InfoExtractor):
3103 """Information extractor for The Daily Show and Colbert Report """
# Either a leading-colon shortname or a thedailyshow/colbertnation URL;
# named groups: shortname, showname, episode.
3105 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3106 IE_NAME = u'comedycentral'
3108 def report_extraction(self, episode_id):
3109 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3111 def report_config_download(self, episode_id):
3112 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3114 def report_index_download(self, episode_id):
3115 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3117 def report_player_url(self, episode_id):
3118 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3120 def _real_extract(self, url):
3121 mobj = re.match(self._VALID_URL, url)
3123 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand a shortname into the show's full-episodes landing URL, then rematch
# so the named groups are populated consistently.
3126 if mobj.group('shortname'):
3127 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3128 url = u'http://www.thedailyshow.com/full-episodes/'
3130 url = u'http://www.colbertnation.com/full-episodes/'
3131 mobj = re.match(self._VALID_URL, url)
3132 assert mobj is not None
3134 dlNewest = not mobj.group('episode')
3136 epTitle = mobj.group('showname')
3138 epTitle = mobj.group('episode')
3140 req = urllib2.Request(url)
3141 self.report_extraction(epTitle)
3143 htmlHandle = urllib2.urlopen(req)
3144 html = htmlHandle.read()
3145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3146 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to the newest episode; re-match the final URL
# to learn which episode we actually landed on.
3149 url = htmlHandle.geturl()
3150 mobj = re.match(self._VALID_URL, url)
3152 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3154 if mobj.group('episode') == '':
3155 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3157 epTitle = mobj.group('episode')
# Flash player params embed both the player URL (group 0) and the mtvn uri
# (group 1) used to query the MRSS index feed.
3159 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3160 if len(mMovieParams) == 0:
3161 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3164 playerUrl_raw = mMovieParams[0][0]
3165 self.report_player_url(epTitle)
3167 urlHandle = urllib2.urlopen(playerUrl_raw)
# Resolve redirects so downstream rtmp/player handling sees the final URL.
3168 playerUrl = urlHandle.geturl()
3169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3170 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3173 uri = mMovieParams[0][1]
3174 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3175 self.report_index_download(epTitle)
3177 indexXml = urllib2.urlopen(indexUrl).read()
3178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3179 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3182 idoc = xml.etree.ElementTree.fromstring(indexXml)
# One <item> per media act; guid is like "...:showname.com:shortId".
3183 itemEls = idoc.findall('.//item')
3184 for itemEl in itemEls:
3185 mediaId = itemEl.findall('./guid')[0].text
3186 shortMediaId = mediaId.split(':')[-1]
3187 showId = mediaId.split(':')[-2].replace('.com', '')
3188 officialTitle = itemEl.findall('./title')[0].text
3189 officialDate = itemEl.findall('./pubDate')[0].text
3191 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3192 urllib.urlencode({'uri': mediaId}))
3193 configReq = urllib2.Request(configUrl)
3194 self.report_config_download(epTitle)
3196 configXml = urllib2.urlopen(configReq).read()
3197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3198 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3201 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) per rendition; the append into `turls` (lines
# 3202/3205-3207) is elided from this listing.
3203 for rendition in cdoc.findall('.//rendition'):
3204 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3208 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3211 # For now, just pick the highest bitrate
3212 format,video_url = turls[-1]
3214 self._downloader.increment_downloads()
3216 effTitle = showId + u'-' + epTitle
# Info dict entries 3217-3220/3222/3224-3226 (id/url/uploader/...) elided.
3221 'upload_date': officialDate,
3223 'stitle': _simplify_title(effTitle),
3227 'description': officialTitle,
3228 'player_url': playerUrl
3232 self._downloader.process_info(info)
3233 except UnavailableVideoError, err:
3234 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com videos: scrapes OpenGraph meta tags for
# description/thumbnail/player URL, fetches the player's config (JS-flavored
# JSON) and takes the stream URL from its playlist.
# NOTE(review): listing is sampled — `if mobj is None:` guards and `return`s
# between the visible statements are elided.
3238 class EscapistIE(InfoExtractor):
3239 """Information extractor for The Escapist """
3241 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3242 IE_NAME = u'escapist'
3244 def report_extraction(self, showName):
3245 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3247 def report_config_download(self, showName):
3248 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3250 def _real_extract(self, url):
# Used to unescape HTML entities scraped out of meta attributes.
3251 htmlParser = HTMLParser.HTMLParser()
3253 mobj = re.match(self._VALID_URL, url)
3255 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3257 showName = mobj.group('showname')
3258 videoId = mobj.group('episode')
3260 self.report_extraction(showName)
3262 webPage = urllib2.urlopen(url).read()
3263 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3264 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3267 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3268 description = htmlParser.unescape(descMatch.group(1))
3269 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3270 imgUrl = htmlParser.unescape(imgMatch.group(1))
3271 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3272 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The config URL is passed percent-encoded in the player URL's query string.
3273 configUrlMatch = re.search('config=(.*)$', playerUrl)
3274 configUrl = urllib2.unquote(configUrlMatch.group(1))
3276 self.report_config_download(showName)
3278 configJSON = urllib2.urlopen(configUrl).read()
3279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3280 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3283 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse the JS object literal;
# breaks if string values themselves contain apostrophes.
3284 configJSON = configJSON.replace("'", '"')
3287 config = json.loads(configJSON)
3288 except (ValueError,), err:
3289 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3292 playlist = config['playlist']
# Index 1 is taken as the actual video entry — presumably index 0 is an
# intro/ad; TODO(review): confirm against the site's config format.
3293 videoUrl = playlist[1]['url']
3295 self._downloader.increment_downloads()
# Info dict entries 3296-3298/3301/3303-3304 (id/url/title/ext/format) elided.
3299 'uploader': showName,
3300 'upload_date': None,
3302 'stitle': _simplify_title(showName),
3305 'thumbnail': imgUrl,
3306 'description': description,
3307 'player_url': playerUrl,
3311 self._downloader.process_info(info)
3312 except UnavailableVideoError, err:
3313 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Extractor for collegehumor.com: finds the internal numeric video id in the
# page, then reads title/description/file URL from the moogaloop metadata XML.
# NOTE(review): listing is sampled — guards, `return`s and `try:` lines
# between the visible statements are elided.
3316 class CollegeHumorIE(InfoExtractor):
3317 """Information extractor for collegehumor.com"""
3319 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3320 IE_NAME = u'collegehumor'
3322 def report_webpage(self, video_id):
3323 """Report information extraction."""
3324 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3326 def report_extraction(self, video_id):
3327 """Report information extraction."""
3328 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3330 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in any visible line here.
3331 htmlParser = HTMLParser.HTMLParser()
3333 mobj = re.match(self._VALID_URL, url)
3335 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3337 video_id = mobj.group('videoid')
3339 self.report_webpage(video_id)
3340 request = urllib2.Request(url)
3342 webpage = urllib2.urlopen(request).read()
3343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3344 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id used by the metadata endpoint.
3347 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3349 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3351 internal_video_id = m.group('internalvideoid')
# `info` dict start (lines 3352-3354) elided; only one visible entry here.
3355 'internal_id': internal_video_id,
3358 self.report_extraction(video_id)
3359 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3361 metaXml = urllib2.urlopen(xmlUrl).read()
3362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3363 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3366 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3368 videoNode = mdoc.findall('./video')[0]
3369 info['description'] = videoNode.findall('./description')[0].text
3370 info['title'] = videoNode.findall('./caption')[0].text
3371 info['stitle'] = _simplify_title(info['title'])
3372 info['url'] = videoNode.findall('./file')[0].text
3373 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the file URL's last dot-suffix.
3374 info['ext'] = info['url'].rpartition('.')[2]
3375 info['format'] = info['ext']
3377 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3380 self._downloader.increment_downloads()
3383 self._downloader.process_info(info)
3384 except UnavailableVideoError, err:
3385 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for xvideos.com: pulls the percent-encoded flv URL, the page
# title, and the thumbnail URL out of the watch-page HTML.
# NOTE(review): listing is sampled — guards and `return`s between the
# visible statements are elided.
3388 class XVideosIE(InfoExtractor):
3389 """Information extractor for xvideos.com"""
3391 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3392 IE_NAME = u'xvideos'
3394 def report_webpage(self, video_id):
3395 """Report information extraction."""
3396 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3398 def report_extraction(self, video_id):
3399 """Report information extraction."""
3400 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3402 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in any visible line here.
3403 htmlParser = HTMLParser.HTMLParser()
3405 mobj = re.match(self._VALID_URL, url)
3407 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3409 video_id = mobj.group(1).decode('utf-8')
3411 self.report_webpage(video_id)
3413 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3415 webpage = urllib2.urlopen(request).read()
3416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3417 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3420 self.report_extraction(video_id)
# The flv URL is percent-encoded in a flashvars-style parameter.
3424 mobj = re.search(r'flv_url=(.+?)&', webpage)
3426 self._downloader.trouble(u'ERROR: unable to extract video url')
3428 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is everything before the " - XVID..." suffix in <title>.
3432 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3434 self._downloader.trouble(u'ERROR: unable to extract video title')
3436 video_title = mobj.group(1).decode('utf-8')
3439 # Extract video thumbnail
3440 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3442 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3444 video_thumbnail = mobj.group(1).decode('utf-8')
3448 self._downloader.increment_downloads()
# Info dict open line (3449-3452) and id/url/ext entries elided.
3453 'upload_date': None,
3454 'title': video_title,
3455 'stitle': _simplify_title(video_title),
3458 'thumbnail': video_thumbnail,
3459 'description': None,
3464 self._downloader.process_info(info)
3465 except UnavailableVideoError, err:
3466 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# Extractor for soundcloud.com tracks: scrapes the stream uid and token from
# the page source and builds the media URL from them.
# NOTE(review): listing is sampled — guards, `return`s and `try:` lines
# between the visible statements are elided.
3469 class SoundcloudIE(InfoExtractor):
3470 """Information extractor for soundcloud.com
3471 To access the media, the uid of the song and a stream token
3472 must be extracted from the page source and the script must make
3473 a request to media.soundcloud.com/crossdomain.xml. Then
3474 the media can be grabbed by requesting from an url composed
3475 of the stream token and uid
# Group 1: uploader slug; group 2: track slug.
3478 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3479 IE_NAME = u'soundcloud'
3481 def __init__(self, downloader=None):
3482 InfoExtractor.__init__(self, downloader)
3484 def report_webpage(self, video_id):
3485 """Report information extraction."""
3486 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3488 def report_extraction(self, video_id):
3489 """Report information extraction."""
3490 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3492 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in any visible line here.
3493 htmlParser = HTMLParser.HTMLParser()
3495 mobj = re.match(self._VALID_URL, url)
3497 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3500 # extract uploader (which is in the url)
3501 uploader = mobj.group(1).decode('utf-8')
3502 # extract simple title (uploader + slug of song title)
3503 slug_title = mobj.group(2).decode('utf-8')
3504 simple_title = uploader + '-' + slug_title
3506 self.report_webpage('%s/%s' % (uploader, slug_title))
3508 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3510 webpage = urllib2.urlopen(request).read()
3511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3512 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3515 self.report_extraction('%s/%s' % (uploader, slug_title))
3517 # extract uid and stream token that soundcloud hands out for access
3518 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3520 video_id = mobj.group(1)
3521 stream_token = mobj.group(2)
3523 # extract unsimplified title
3524 mobj = re.search('"title":"(.*?)",', webpage)
3526 title = mobj.group(1)
3528 # construct media url (with uid/token)
3529 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3530 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional on the page, hence the default.
3533 description = u'No description available'
3534 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3536 description = mobj.group(1)
# Upload date: parsed from the "pretty-date" element, e.g. "November 23, 2011 12:34".
# NOTE(review): the initialization of `upload_date` (lines 3537-3539) is
# elided here — confirm it is set to a default before this try block.
3540 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3543 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3544 except Exception, e:
3547 # for soundcloud, a request to a cross domain is required for cookies
3548 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3551 self._downloader.process_info({
3552 'id': video_id.decode('utf-8'),
# Info dict entries 3553/3558-3560 (url/ext/format/thumbnail) elided.
3554 'uploader': uploader.decode('utf-8'),
3555 'upload_date': upload_date,
# NOTE(review): 'title' uses simple_title rather than the scraped `title`
# from line 3526 — possibly intentional, possibly a bug; confirm.
3556 'title': simple_title.decode('utf-8'),
3557 'stitle': simple_title.decode('utf-8'),
3561 'description': description.decode('utf-8')
3563 except UnavailableVideoError:
3564 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for infoq.com presentations: decodes the base64 `jsclassref`
# attribute into an rtmpe stream path.
# NOTE(review): listing is sampled — the IE_NAME assignment (line 3571) and
# the usual `if mobj is None:` guards are elided from view.
3567 class InfoQIE(InfoExtractor):
3568 """Information extractor for infoq.com"""
3570 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3573 def report_webpage(self, video_id):
3574 """Report information extraction."""
3575 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3577 def report_extraction(self, video_id):
3578 """Report information extraction."""
3579 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3581 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in any visible line here.
3582 htmlParser = HTMLParser.HTMLParser()
3584 mobj = re.match(self._VALID_URL, url)
3586 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589 self.report_webpage(url)
3591 request = urllib2.Request(url)
3593 webpage = urllib2.urlopen(request).read()
3594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3595 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3598 self.report_extraction(url)
# The stream path is base64-encoded (then percent-encoded) in jsclassref.
3602 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3604 self._downloader.trouble(u'ERROR: unable to extract video url')
3606 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3610 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3612 self._downloader.trouble(u'ERROR: unable to extract video title')
3614 video_title = mobj.group(1).decode('utf-8')
3616 # Extract description
3617 video_description = u'No description available.'
3618 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3619 if mobj is not None:
3620 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the last path component of the stream URL.
3622 video_filename = video_url.split('/')[-1]
3623 video_id, extension = video_filename.split('.')
3625 self._downloader.increment_downloads()
# Info dict open line and id/url/ext/thumbnail entries elided.
3630 'upload_date': None,
3631 'title': video_title,
3632 'stitle': _simplify_title(video_title),
3634 'format': extension, # Extension is always(?) mp4, but seems to be flv
3636 'description': video_description,
3641 self._downloader.process_info(info)
3642 except UnavailableVideoError, err:
3643 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# Extractor for mixcloud.com: queries the site's JSON API for a cloudcast,
# then picks a working stream URL from the advertised audio formats.
# NOTE(review): listing is sampled — guards, `return`s and `try:` lines
# between the visible statements are elided.
3645 class MixcloudIE(InfoExtractor):
3646 """Information extractor for www.mixcloud.com"""
3647 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3648 IE_NAME = u'mixcloud'
3650 def __init__(self, downloader=None):
3651 InfoExtractor.__init__(self, downloader)
3653 def report_download_json(self, file_id):
3654 """Report JSON download."""
3655 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3657 def report_extraction(self, file_id):
3658 """Report information extraction."""
3659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3661 def get_urls(self, jsonData, fmt, bitrate='best'):
3662 """Get urls from 'audio_formats' section in json"""
# Formats may either map bitrate -> url list, or be a bare url list; the
# TypeError fallback below handles the latter shape.
3665 bitrate_list = jsonData[fmt]
3666 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3667 bitrate = max(bitrate_list) # select highest
3669 url_list = jsonData[fmt][bitrate]
3670 except TypeError: # we have no bitrate info.
3671 url_list = jsonData[fmt]
3675 def check_urls(self, url_list):
3676 """Returns 1st active url from list"""
# Probes each candidate with a GET; network failures fall through to the
# next candidate (the `return url` / `continue` lines are elided here).
3677 for url in url_list:
3679 urllib2.urlopen(url)
3681 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3686 def _print_formats(self, formats):
3687 print 'Available formats:'
3688 for fmt in formats.keys():
3689 for b in formats[fmt]:
3691 ext = formats[fmt][b][0]
3692 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3693 except TypeError: # we have no bitrate info
3694 ext = formats[fmt][0]
3695 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3698 def _real_extract(self, url):
3699 mobj = re.match(self._VALID_URL, url)
3701 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3703 # extract uploader & filename from url
3704 uploader = mobj.group(1).decode('utf-8')
3705 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3707 # construct API request
# The API path reuses the uploader/track segments of the original URL.
3708 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3709 # retrieve .json file with links to files
3710 request = urllib2.Request(file_url)
3712 self.report_download_json(file_url)
3713 jsonData = urllib2.urlopen(request).read()
3714 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3715 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3719 json_data = json.loads(jsonData)
3720 player_url = json_data['player_swf_url']
3721 formats = dict(json_data['audio_formats'])
3723 req_format = self._downloader.params.get('format', None)
3726 if self._downloader.params.get('listformats', None):
3727 self._print_formats(formats)
# Default/best: try each advertised format until one yields a live URL.
3730 if req_format is None or req_format == 'best':
3731 for format_param in formats.keys():
3732 url_list = self.get_urls(formats, format_param)
3734 file_url = self.check_urls(url_list)
3735 if file_url is not None:
3738 if req_format not in formats.keys():
3739 self._downloader.trouble(u'ERROR: format is not available')
3742 url_list = self.get_urls(formats, req_format)
3743 file_url = self.check_urls(url_list)
3744 format_param = req_format
3747 self._downloader.increment_downloads()
3749 # Process file information
3750 self._downloader.process_info({
3751 'id': file_id.decode('utf-8'),
3752 'url': file_url.decode('utf-8'),
3753 'uploader': uploader.decode('utf-8'),
3754 'upload_date': u'NA',
3755 'title': json_data['name'],
3756 'stitle': _simplify_title(json_data['name']),
3757 'ext': file_url.split('.')[-1].decode('utf-8'),
3758 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3759 'thumbnail': json_data['thumbnail_url'],
3760 'description': json_data['description'],
3761 'player_url': player_url.decode('utf-8'),
3763 except UnavailableVideoError, err:
3764 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Stanford Open Classroom lecture videos: resolves a course/
# video pair to a per-video metadata XML and the video file it names.
# NOTE(review): listing is sampled — guards, `try:` lines and `return`s
# between the visible statements are elided; only the "specific video"
# branch of _real_extract is visible here.
3766 class StanfordOpenClassroomIE(InfoExtractor):
3767 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: path, course, video; course/video come from the query string.
3769 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3770 IE_NAME = u'stanfordoc'
3772 def report_extraction(self, video_id):
3773 """Report information extraction."""
3774 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3776 def _real_extract(self, url):
3777 mobj = re.match(self._VALID_URL, url)
3779 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3782 if mobj.group('course') and mobj.group('video'): # A specific video
3783 course = mobj.group('course')
3784 video = mobj.group('video')
# info dict open line (3785) and further entries elided around this one.
3786 'id': _simplify_title(course + '_' + video),
3789 self.report_extraction(info['id'])
3790 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3791 xmlUrl = baseUrl + video + '.xml'
3793 metaXml = urllib2.urlopen(xmlUrl).read()
3794 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3795 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3797 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3799 info['title'] = mdoc.findall('./title')[0].text
# videoFile in the XML is relative; prefix with the course's videos/ dir.
3800 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3802 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3804 info['stitle'] = _simplify_title(info['title'])
3805 info['ext'] = info['url'].rpartition('.')[2]
3806 info['format'] = info['ext']
3807 self._downloader.increment_downloads()
3809 self._downloader.process_info(info)
3810 except UnavailableVideoError, err:
3811 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): stray statement between StanfordOpenClassroomIE and
# PostProcessor — the surrounding definition (lines 3812/3814-3820) is
# elided, so its enclosing scope cannot be determined from this listing.
3813 print('TODO: Not yet implemented')
# Base class for post-processing steps; concrete PPs (e.g.
# FFmpegExtractAudioPP below) override run().
3821 class PostProcessor(object):
3822 """Post Processor class.
3824 PostProcessor objects can be added to downloaders with their
3825 add_post_processor() method. When the downloader has finished a
3826 successful download, it will take its internal chain of PostProcessors
3827 and start calling the run() method on each one of them, first with
3828 an initial argument and then with the returned value of the previous
3831 The chain will be stopped if one of them ever returns None or the end
3832 of the chain is reached.
3834 PostProcessor objects follow a "mutual registration" process similar
3835 to InfoExtractor objects.
3840 def __init__(self, downloader=None):
3841 self._downloader = downloader
3843 def set_downloader(self, downloader):
3844 """Sets the downloader for this PP."""
3845 self._downloader = downloader
3847 def run(self, information):
3848 """Run the PostProcessor.
3850 The "information" argument is a dictionary like the ones
3851 composed by InfoExtractors. The only difference is that this
3852 one has an extra field called "filepath" that points to the
3855 When this method returns None, the postprocessing chain is
3856 stopped. However, this method may return an information
3857 dictionary that will be passed to the next postprocessing
3858 object in the chain. It can be the one it received after
3859 changing some fields.
3861 In addition, this method may raise a PostProcessingError
3862 exception that will be taken into account by the downloader
# Identity behavior: the base class passes the info dict through unchanged.
3865 return information # by default, do nothing
# Post-processor that extracts the audio track from a downloaded video via
# ffmpeg/ffprobe, optionally transcoding to a preferred codec/quality.
# NOTE(review): listing is sampled — `@staticmethod` decorators (expected
# before get_audio_codec/run_ffmpeg), `try:` lines and `return`s between the
# visible statements are elided.
3868 class FFmpegExtractAudioPP(PostProcessor):
3870 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3871 PostProcessor.__init__(self, downloader)
3872 if preferredcodec is None:
3873 preferredcodec = 'best'
3874 self._preferredcodec = preferredcodec
3875 self._preferredquality = preferredquality
3876 self._keepvideo = keepvideo
# Probe the file's audio codec name with ffprobe; visible logic scans the
# ffprobe output for codec_name=/codec_type=audio pairs.
3879 def get_audio_codec(path):
3881 cmd = ['ffprobe', '-show_streams', '--', path]
3882 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3883 output = handle.communicate()[0]
3884 if handle.wait() != 0:
3886 except (IOError, OSError):
3889 for line in output.split('\n'):
3890 if line.startswith('codec_name='):
3891 audio_codec = line.split('=')[1].strip()
3892 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to strip video (-vn) and encode audio with the given codec
# and extra options; returns based on the process exit status (elided).
3897 def run_ffmpeg(path, out_path, codec, more_opts):
3899 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3900 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3902 except (IOError, OSError):
3905 def run(self, information):
3906 path = information['filepath']
3908 filecodec = self.get_audio_codec(path)
3909 if filecodec is None:
3910 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Lossless copy path: when the preferred codec matches (or is 'best') and
# the source codec can live in its own container, avoid re-encoding.
3914 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3915 if filecodec in ['aac', 'mp3', 'vorbis']:
3916 # Lossless if possible
3918 extension = filecodec
3919 if filecodec == 'aac':
3920 more_opts = ['-f', 'adts']
3921 if filecodec == 'vorbis':
3925 acodec = 'libmp3lame'
3928 if self._preferredquality is not None:
3929 more_opts += ['-ab', self._preferredquality]
3931 # We convert the audio (lossy)
3932 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3933 extension = self._preferredcodec
3935 if self._preferredquality is not None:
3936 more_opts += ['-ab', self._preferredquality]
3937 if self._preferredcodec == 'aac':
3938 more_opts += ['-f', 'adts']
3939 if self._preferredcodec == 'vorbis':
# Output path: same prefix, new audio extension.
3942 (prefix, ext) = os.path.splitext(path)
3943 new_path = prefix + '.' + extension
3944 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3945 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3948 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3951 # Try to update the date time for extracted audio file.
3952 if information.get('filetime') is not None:
3954 os.utime(new_path, (time.time(), information['filetime']))
3956 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Delete the source video unless --keep-video was requested (os.remove
# call on line 3960 elided).
3958 if not self._keepvideo:
3961 except (IOError, OSError):
3962 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Propagate the new path so later PPs in the chain see the audio file.
3965 information['filepath'] = new_path
3969 def updateSelf(downloader, filename):
3970 ''' Update the program file with the latest version from the repository '''
3971 # Note: downloader only used for options
# Refuse early if we cannot rewrite our own file.
3972 if not os.access(filename, os.W_OK):
3973 sys.exit('ERROR: no write permissions on %s' % filename)
3975 downloader.to_screen('Updating to latest version...')
3979 urlh = urllib.urlopen(UPDATE_URL)
3980 newcontent = urlh.read()
# Compare the downloaded script's __version__ against ours; skip the
# rewrite when already current.
3982 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3983 if vmatch is not None and vmatch.group(1) == __version__:
3984 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3988 except (IOError, OSError), err:
3989 sys.exit('ERROR: unable to download latest version')
# Overwrite ourselves in binary mode (close/finally lines elided).
3992 outf = open(filename, 'wb')
3994 outf.write(newcontent)
3997 except (IOError, OSError), err:
3998 sys.exit('ERROR: unable to overwrite current version')
4000 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line options from a config file, one shlex-split line
# at a time ('#' comments honored). Missing file yields no options.
# NOTE(review): the `res = []` initialization and the for-loop header
# (lines 4009/4013-4015) are elided from this listing.
4008 def _readOptions(filename):
4010 optionf = open(filename)
4012 return [] # silently skip if file is not present
4016 res += shlex.split(l, comments=True)
# Render an optparse option as "-o, --option METAVAR" for help output.
# NOTE(review): the `opts = []` initialization (lines 4023-4025) is elided.
4021 def _format_option_string(option):
4022 ''' ('-o', '--option') -> -o, --format METAVAR'''
4026 if option._short_opts: opts.append(option._short_opts[0])
4027 if option._long_opts: opts.append(option._long_opts[0])
# Separator only needed when both a short and a long form were appended.
4028 if len(opts) > 1: opts.insert(1, ', ')
4030 if option.takes_value(): opts.append(' %s' % option.metavar)
4032 return "".join(opts)
# Determine the terminal width: prefer the COLUMNS environment variable,
# otherwise ask `stty size` (whose output is "rows cols").
# NOTE(review): the COLUMNS early-return, `try:` line and failure fallback
# (lines 4036-4039, 4043-4045) are elided from this listing.
4034 def _find_term_columns():
4035 columns = os.environ.get('COLUMNS', None)
4040 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4041 out,err = sp.communicate()
4042 return int(out.split()[1])
4048 max_help_position = 80
4050 # No need to wrap help messages if we're on a wide console
4051 columns = _find_term_columns()
4052 if columns: max_width = columns
4054 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4055 fmt.format_option_strings = _format_option_string
4058 'version' : __version__,
4060 'usage' : '%prog [options] url [url...]',
4061 'conflict_handler' : 'resolve',
4064 parser = optparse.OptionParser(**kw)
4067 general = optparse.OptionGroup(parser, 'General Options')
4068 selection = optparse.OptionGroup(parser, 'Video Selection')
4069 authentication = optparse.OptionGroup(parser, 'Authentication Options')
# NOTE(review): this span is the TAIL of the option-parser setup routine
# (`parseOpts` in youtube-dl) -- the enclosing `def`, the creation of
# `parser`, and the `general`/`selection`/`authentication` OptionGroup
# objects lie before this excerpt.  Every line also carries a pasted
# listing number (e.g. "4070 "), indentation has been flattened, and the
# listing numbers show missing lines (blank separators, plus the
# `if xdg_config_home:` / `else:` pair around the userConf assignments) --
# TODO confirm against the pristine file before editing any of this.
# Remaining option groups; `parser` is presumably the optparse.OptionParser
# built earlier in the (unseen) part of this routine.
4070 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4071 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4072 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4073 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# -- General options: help/version/self-update/error tolerance/rate limits --
4075 general.add_option('-h', '--help',
4076 action='help', help='print this help text and exit')
4077 general.add_option('-v', '--version',
4078 action='version', help='print program version and exit')
4079 general.add_option('-U', '--update',
4080 action='store_true', dest='update_self', help='update this program to latest version')
4081 general.add_option('-i', '--ignore-errors',
4082 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4083 general.add_option('-r', '--rate-limit',
4084 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4085 general.add_option('-R', '--retries',
4086 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4087 general.add_option('--dump-user-agent',
4088 action='store_true', dest='dump_user_agent',
4089 help='display the current browser identification', default=False)
4090 general.add_option('--list-extractors',
4091 action='store_true', dest='list_extractors',
4092 help='List all supported extractors and the URLs they would handle', default=False)
# -- Video selection: playlist slicing, title regex filters, download cap --
4094 selection.add_option('--playlist-start',
4095 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4096 selection.add_option('--playlist-end',
4097 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4098 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4099 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4100 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
# -- Authentication: explicit credentials or .netrc lookup --
4102 authentication.add_option('-u', '--username',
4103 dest='username', metavar='USERNAME', help='account username')
4104 authentication.add_option('-p', '--password',
4105 dest='password', metavar='PASSWORD', help='account password')
4106 authentication.add_option('-n', '--netrc',
4107 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# -- Video format: -f shares dest 'format' with --all-formats (const 'all') --
4110 video_format.add_option('-f', '--format',
4111 action='store', dest='format', metavar='FORMAT', help='video format code')
4112 video_format.add_option('--all-formats',
4113 action='store_const', dest='format', help='download all available video formats', const='all')
4114 video_format.add_option('--max-quality',
4115 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4116 video_format.add_option('-F', '--list-formats',
4117 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
# -- Verbosity / simulation: the --get-* flags each imply quiet simulation
#    (combined later when the downloader's 'quiet'/'skip_download' are built) --
4120 verbosity.add_option('-q', '--quiet',
4121 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4122 verbosity.add_option('-s', '--simulate',
4123 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4124 verbosity.add_option('--skip-download',
4125 action='store_true', dest='skip_download', help='do not download the video', default=False)
4126 verbosity.add_option('-g', '--get-url',
4127 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4128 verbosity.add_option('-e', '--get-title',
4129 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4130 verbosity.add_option('--get-thumbnail',
4131 action='store_true', dest='getthumbnail',
4132 help='simulate, quiet but print thumbnail URL', default=False)
4133 verbosity.add_option('--get-description',
4134 action='store_true', dest='getdescription',
4135 help='simulate, quiet but print video description', default=False)
4136 verbosity.add_option('--get-filename',
4137 action='store_true', dest='getfilename',
4138 help='simulate, quiet but print output filename', default=False)
4139 verbosity.add_option('--get-format',
4140 action='store_true', dest='getformat',
4141 help='simulate, quiet but print output format', default=False)
4142 verbosity.add_option('--no-progress',
4143 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4144 verbosity.add_option('--console-title',
4145 action='store_true', dest='consoletitle',
4146 help='display progress in console titlebar', default=False)
# -- Filesystem: naming template, batch input, overwrite/resume policy,
#    cookies, .part files, mtime, and sidecar metadata files --
4149 filesystem.add_option('-t', '--title',
4150 action='store_true', dest='usetitle', help='use title in file name', default=False)
4151 filesystem.add_option('-l', '--literal',
4152 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4153 filesystem.add_option('-A', '--auto-number',
4154 action='store_true', dest='autonumber',
4155 help='number downloaded files starting from 00000', default=False)
4156 filesystem.add_option('-o', '--output',
4157 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4158 filesystem.add_option('-a', '--batch-file',
4159 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4160 filesystem.add_option('-w', '--no-overwrites',
4161 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
# --continue / --no-continue share dest 'continue_dl' (store_true vs store_false).
4162 filesystem.add_option('-c', '--continue',
4163 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4164 filesystem.add_option('--no-continue',
4165 action='store_false', dest='continue_dl',
4166 help='do not resume partially downloaded files (restart from beginning)')
4167 filesystem.add_option('--cookies',
4168 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4169 filesystem.add_option('--no-part',
4170 action='store_true', dest='nopart', help='do not use .part files', default=False)
# --no-mtime stores False into 'updatetime' (default True = honour Last-modified).
4171 filesystem.add_option('--no-mtime',
4172 action='store_false', dest='updatetime',
4173 help='do not use the Last-modified header to set the file modification time', default=True)
4174 filesystem.add_option('--write-description',
4175 action='store_true', dest='writedescription',
4176 help='write video description to a .description file', default=False)
4177 filesystem.add_option('--write-info-json',
4178 action='store_true', dest='writeinfojson',
4179 help='write video metadata to a .info.json file', default=False)
# -- Post-processing: audio extraction via ffmpeg/ffprobe --
4182 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4183 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4184 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4185 help='"best", "aac", "vorbis" or "mp3"; best by default')
4186 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4187 help='ffmpeg audio bitrate specification, 128k by default')
4188 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4189 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Register all groups on the parser; this order controls --help output.
4192 parser.add_option_group(general)
4193 parser.add_option_group(selection)
4194 parser.add_option_group(filesystem)
4195 parser.add_option_group(verbosity)
4196 parser.add_option_group(video_format)
4197 parser.add_option_group(authentication)
4198 parser.add_option_group(postproc)
# Build argv from /etc config + per-user config + real CLI args, then parse.
# NOTE(review): listing numbers 4201 and 4203 are absent here -- presumably
# the `if xdg_config_home:` / `else:` pair selecting between the two
# userConf assignments below; confirm against the pristine file.
4200 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4202 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4204 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4205 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4206 opts, args = parser.parse_args(argv)
# Returns the parser itself (so callers can issue parser.error) plus the
# parsed options and the positional URL arguments.
4208 return parser, opts, args
# NOTE(review): this definition is heavily truncated by the extraction --
# the docstring's closing quotes, the `return [` opener, and most of the
# extractor entries (listing numbers 4213, 4217, 4221, 4223-4240, 4242-4246)
# are missing.  The visible lines show the intent: build shared YouTube /
# Google / Yahoo IE instances first, because the playlist/user/search
# extractors delegate matched videos to them.  Do not edit without the
# pristine file.
4210 def gen_extractors():
4211 """ Return a list of an instance of every supported extractor.
4212 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors, reused by the dependent IEs below.
4214 youtube_ie = YoutubeIE()
4215 google_ie = GoogleIE()
4216 yahoo_ie = YahooIE()
# Dependent extractors wrap the shared instances (playlist/user/search
# results are resolved through the corresponding base IE).
4218 YoutubePlaylistIE(youtube_ie),
4219 YoutubeUserIE(youtube_ie),
4220 YoutubeSearchIE(youtube_ie),
4222 MetacafeIE(youtube_ie),
4225 GoogleSearchIE(google_ie),
4228 YahooSearchIE(yahoo_ie),
4241 StanfordOpenClassroomIE(),
# NOTE(review): this span is the body of the program's main driver
# (`_real_main` in youtube-dl); its `def` line (listing number ~4246) is
# missing from the extraction, along with several structural lines flagged
# inline below.  Phases: parse options -> cookie jar -> batch-file URLs ->
# opener install -> --list-extractors -> option validation ->
# FileDownloader construction -> post-processors -> self-update ->
# download -> cookie save.
4247 parser, opts, args = parseOpts()
# --- Cookie jar: in-memory by default, Mozilla-format file with --cookies.
# NOTE(review): the `else:` (4252-4253) and the `try:`-wrapped load of the
# existing cookie file (4256) are absent here; the bare `except` below
# presumably pairs with that missing `try:` -- confirm.
4249 # Open appropriate CookieJar
4250 if opts.cookiefile is None:
4251 jar = cookielib.CookieJar()
4254 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4255 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4257 except (IOError, OSError), err:
4258 sys.exit(u'ERROR: unable to open cookie file')
# --- --dump-user-agent: print the UA from std_headers (defined near the
# top of the file) and, presumably, exit (the exit line is not visible).
4261 if opts.dump_user_agent:
4262 print std_headers['User-Agent']
# --- Batch file: read URLs from a file or stdin ('-'), strip blanks and
# comment lines starting with '#', '/' or ';'.
# NOTE(review): the surrounding `try:` and the stdin branch body
# (listing numbers 4266, 4268, 4270-4271, 4276) are missing -- the
# bare 4277 error line presumably sits in the matching `except`.
4265 # Batch file verification
4267 if opts.batchfile is not None:
4269 if opts.batchfile == '-':
4272 batchfd = open(opts.batchfile, 'r')
4273 batchurls = batchfd.readlines()
4274 batchurls = [x.strip() for x in batchurls]
4275 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4277 sys.exit(u'ERROR: batch file could not be read')
4278 all_urls = batchurls + args
# --- Global urllib2 configuration: proxy + cookie processor + the
# project's YoutubeDLHandler, installed process-wide.
4280 # General configuration
4281 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4282 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4283 urllib2.install_opener(opener)
4284 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4286 extractors = gen_extractors()
# --- --list-extractors: print each IE and which of the given URLs it
# would claim (first-match-wins, so claimed URLs are removed).
# NOTE(review): the per-IE print line (4290) and the loop body after 4293
# (4294-4296, presumably printing each matched URL and then exiting) are
# missing from this extraction.
4288 if opts.list_extractors:
4289 for ie in extractors:
4291 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4292 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4293 for mu in matchedUrls:
# --- Option validation: conflicts, missing credentials, numeric parses.
4297 # Conflicting, missing and erroneous options
4298 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4299 parser.error(u'using .netrc conflicts with giving username/password')
4300 if opts.password is not None and opts.username is None:
4301 parser.error(u'account username missing')
4302 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4303 parser.error(u'using output template conflicts with using title, literal title or auto number')
4304 if opts.usetitle and opts.useliteral:
4305 parser.error(u'using title conflicts with using literal title')
# Prompt interactively when a username was given without a password.
4306 if opts.username is not None and opts.password is None:
4307 opts.password = getpass.getpass(u'Type account password and press return:')
# Rate limit arrives as a string like '50k'; FileDownloader.parse_bytes
# converts it (None signals an unparsable value).
4308 if opts.ratelimit is not None:
4309 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4310 if numeric_limit is None:
4311 parser.error(u'invalid rate limit specified')
4312 opts.ratelimit = numeric_limit
# NOTE(review): the `try:` lines matching the three `except` clauses below
# (listing numbers 4314, 4318, 4324) are absent from this extraction.
4313 if opts.retries is not None:
4315 opts.retries = long(opts.retries)
4316 except (TypeError, ValueError), err:
4317 parser.error(u'invalid retry count specified')
4319 opts.playliststart = int(opts.playliststart)
4320 if opts.playliststart <= 0:
4321 raise ValueError(u'Playlist start must be positive')
4322 except (TypeError, ValueError), err:
4323 parser.error(u'invalid playlist start number specified')
4325 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "until the end of the playlist".
4326 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4327 raise ValueError(u'Playlist end must be greater than playlist start')
4328 except (TypeError, ValueError), err:
4329 parser.error(u'invalid playlist end number specified')
4330 if opts.extractaudio:
4331 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4332 parser.error(u'invalid audio format specified')
# --- Build the FileDownloader from the validated options.
# NOTE(review): the closing `})` of this dict literal (listing number 4378)
# is missing from the extraction.
4335 fd = FileDownloader({
4336 'usenetrc': opts.usenetrc,
4337 'username': opts.username,
4338 'password': opts.password,
# Any --get-* flag implies quiet mode and implies skipping the download.
4339 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4340 'forceurl': opts.geturl,
4341 'forcetitle': opts.gettitle,
4342 'forcethumbnail': opts.getthumbnail,
4343 'forcedescription': opts.getdescription,
4344 'forcefilename': opts.getfilename,
4345 'forceformat': opts.getformat,
4346 'simulate': opts.simulate,
4347 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4348 'format': opts.format,
4349 'format_limit': opts.format_limit,
4350 'listformats': opts.listformats,
# Output template fallback chain: explicit -o wins; otherwise the first
# truthy combination of format/-t/-l/-A flags picks a template; the bare
# '%(id)s.%(ext)s' is the final default.  (Relies on `and`/`or`
# short-circuiting returning the operand, Python 2 idiom.)
4351 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4352 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4353 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4354 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4355 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4356 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4357 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4358 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4359 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4360 or u'%(id)s.%(ext)s'),
4361 'ignoreerrors': opts.ignoreerrors,
4362 'ratelimit': opts.ratelimit,
4363 'nooverwrites': opts.nooverwrites,
4364 'retries': opts.retries,
4365 'continuedl': opts.continue_dl,
4366 'noprogress': opts.noprogress,
4367 'playliststart': opts.playliststart,
4368 'playlistend': opts.playlistend,
# '-o -' means "write the video to stdout", so logging moves to stderr.
4369 'logtostderr': opts.outtmpl == '-',
4370 'consoletitle': opts.consoletitle,
4371 'nopart': opts.nopart,
4372 'updatetime': opts.updatetime,
4373 'writedescription': opts.writedescription,
4374 'writeinfojson': opts.writeinfojson,
4375 'matchtitle': opts.matchtitle,
4376 'rejecttitle': opts.rejecttitle,
4377 'max_downloads': opts.max_downloads,
# Register every extractor on the downloader (order = match priority).
4379 for extractor in extractors:
4380 fd.add_info_extractor(extractor)
# Post-processor for --extract-audio.
4383 if opts.extractaudio:
4384 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update replaces the running script in place (see updateSelf).
4387 if opts.update_self:
4388 updateSelf(fd, sys.argv[0])
# No URLs is an error unless we only came here to self-update.
4391 if len(all_urls) < 1:
4392 if not opts.update_self:
4393 parser.error(u'you must provide at least one URL')
# NOTE(review): the retcode is presumably returned/used after this
# excerpt's visible lines (sys.exit(retcode) at ~4404 is not shown).
4396 retcode = fd.download(all_urls)
# Persist cookies back to the --cookies file.
# NOTE(review): the `try:` and `jar.save()` lines (4400-4401) matching the
# `except` below are absent from this extraction.
4398 # Dump cookie jar if requested
4399 if opts.cookiefile is not None:
4402 except (IOError, OSError), err:
4403 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): these `except` clauses are the tail of the top-level entry
# wrapper (`main` in youtube-dl); its `def` line and the `try:` invoking
# the real driver (listing numbers ~4405-4409), plus the body of the
# DownloadError branch (4411, presumably sys.exit(1)), are missing from
# this extraction.  The wrapper maps known failures to process exit codes.
4410 except DownloadError:
4412 except SameFileError:
4413 sys.exit(u'ERROR: fixed output name but more than one file to download')
4414 except KeyboardInterrupt:
4415 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point.
# NOTE(review): the guarded call itself (listing number 4418, presumably
# `main()`) is missing from this extraction.
4417 if __name__ == '__main__':
4420 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: