2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2011.11.23'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
219 entity = matchobj.group(1)
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
231 numstr = u'0%s' % numstr
234 return unichr(long(numstr, base))
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities (&amp;, &#123;, ...) into real Unicode characters
    # so the resulting filename contains readable text, not entity markup.
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    # The OS path separator would split the name into directories; replace it.
    return utitle.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
def _simplify_title(title):
    """Return a simplified, filesystem-friendly version of the title."""
    # Any run of characters outside [word chars, digits, '_', '-'] collapses
    # to a single underscore; (?u)/re.UNICODE keeps \w Unicode-aware.
    expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
    # Strip underscores the substitution may have introduced at the edges.
    return expr.sub(u'_', title).strip(u'_')
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
293 class DownloadError(Exception):
294 """Download Error exception.
296 This exception may be thrown by FileDownloader objects if they are not
297 configured to continue on errors. They will contain the appropriate
303 class SameFileError(Exception):
304 """Same File exception.
306 This exception will be thrown by FileDownloader objects if they detect
307 multiple files would have to be downloaded to the same file on disk.
312 class PostProcessingError(Exception):
313 """Post Processing exception.
315 This exception may be raised by PostProcessor's .run() method to
316 indicate an error in the postprocessing task.
320 class MaxDownloadsReached(Exception):
321 """ --max-downloads limit has been reached. """
325 class UnavailableVideoError(Exception):
326 """Unavailable Format exception.
328 This exception will be thrown when a video is requested
329 in a format that is not available for that video.
334 class ContentTooShortError(Exception):
335 """Content Too Short exception.
337 This exception may be raised by FileDownloader objects when a file they
338 download is too small for what the server announced first, indicating
339 the connection was probably interrupted.
345 def __init__(self, downloaded, expected):
346 self.downloaded = downloaded
347 self.expected = expected
350 class YoutubeDLHandler(urllib2.HTTPHandler):
351 """Handler for HTTP requests and responses.
353 This class, when installed with an OpenerDirector, automatically adds
354 the standard headers to every HTTP request and handles gzipped and
355 deflated responses from web servers. If compression is to be avoided in
356 a particular request, the original request in the program code only has
357 to include the HTTP header "Youtubedl-No-Compression", which will be
358 removed before making the real request.
360 Part of this code was copied from:
362 http://techknack.net/python-urllib2-handlers/
364 Andrew Rowls, the author of that code, agreed to release it to the
371 return zlib.decompress(data, -zlib.MAX_WBITS)
373 return zlib.decompress(data)
376 def addinfourl_wrapper(stream, headers, url, code):
377 if hasattr(urllib2.addinfourl, 'getcode'):
378 return urllib2.addinfourl(stream, headers, url, code)
379 ret = urllib2.addinfourl(stream, headers, url)
383 def http_request(self, req):
384 for h in std_headers:
387 req.add_header(h, std_headers[h])
388 if 'Youtubedl-no-compression' in req.headers:
389 if 'Accept-encoding' in req.headers:
390 del req.headers['Accept-encoding']
391 del req.headers['Youtubedl-no-compression']
394 def http_response(self, req, resp):
397 if resp.headers.get('Content-encoding', '') == 'gzip':
398 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
399 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
400 resp.msg = old_resp.msg
402 if resp.headers.get('Content-encoding', '') == 'deflate':
403 gz = StringIO.StringIO(self.deflate(resp.read()))
404 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
405 resp.msg = old_resp.msg
409 class FileDownloader(object):
410 """File Downloader class.
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
470 _download_retcode = None
471 _num_downloads = None
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
478 self._download_retcode = 0
479 self._num_downloads = 0
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
484 def format_bytes(bytes):
487 if type(bytes) is str:
492 exponent = long(math.log(bytes, 1024.0))
493 suffix = 'bkMGTPEZY'[exponent]
494 converted = float(bytes) / float(1024 ** exponent)
495 return '%.2f%s' % (converted, suffix)
498 def calc_percent(byte_counter, data_len):
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
504 def calc_eta(start, now, total, current):
508 if current == 0 or dif < 0.001: # One millisecond
510 rate = float(current) / dif
511 eta = long((float(total) - float(current)) / rate)
512 (eta_mins, eta_secs) = divmod(eta, 60)
515 return '%02d:%02d' % (eta_mins, eta_secs)
518 def calc_speed(start, now, bytes):
520 if bytes == 0 or dif < 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
525 def best_block_size(elapsed_time, bytes):
526 new_min = max(bytes / 2.0, 1.0)
527 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time < 0.001:
530 rate = bytes / elapsed_time
538 def parse_bytes(bytestr):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
543 number = float(matchobj.group(1))
544 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545 return long(round(number * multiplier))
547 def add_info_extractor(self, ie):
548 """Add an InfoExtractor object to the end of the list."""
550 ie.set_downloader(self)
552 def add_post_processor(self, pp):
553 """Add a PostProcessor object to the end of the chain."""
555 pp.set_downloader(self)
557 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
558 """Print message to stdout if not in quiet mode."""
560 if not self.params.get('quiet', False):
561 terminator = [u'\n', u''][skip_eol]
562 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
563 self._screen_file.flush()
564 except (UnicodeEncodeError), err:
565 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode explicitly: stderr may reject raw unicode under some locales,
    # so convert with the system's preferred encoding first.
    print >>sys.stderr, message.encode(preferredencoding())
572 def to_cons_title(self, message):
573 """Set console/terminal window title to message."""
574 if not self.params.get('consoletitle', False):
576 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
580 elif 'TERM' in os.environ:
581 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Checks if the output template is fixed."""
    # A template is "fixed" when it contains no %(field)s placeholders,
    # i.e. every download would be written to the same output filename.
    return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
587 def trouble(self, message=None):
588 """Determine action to take when a download problem appears.
590 Depending on if the downloader has been configured to ignore
591 download errors or not, this method may throw an exception or
592 not when errors are found, after printing the message.
594 if message is not None:
595 self.to_stderr(message)
596 if not self.params.get('ignoreerrors', False):
597 raise DownloadError(message)
598 self._download_retcode = 1
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
606 elapsed = now - start_time
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
618 return filename + u'.part'
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
625 def try_rename(self, old_filename, new_filename):
627 if old_filename == new_filename:
629 os.rename(old_filename, new_filename)
630 except (IOError, OSError), err:
631 self.trouble(u'ERROR: unable to rename file')
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr is None:
637 if not os.path.isfile(filename):
639 timestr = last_modified_hdr
642 filetime = timeconvert(timestr)
646 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description is being saved to descfn."""
    notice = u'[info] Writing video description to: %s' % descfn
    self.to_screen(notice, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """ Report that the metadata file is being written """
    # The original message read "Video description metadata as JSON to:",
    # which lacked a verb; phrase it like the description-file message.
    self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Tell the user which file the download will be written to."""
    notice = u'[download] Destination: %s' % filename
    self.to_screen(notice, ignore_encoding_errors=True)
663 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664 """Report download progress."""
665 if self.params.get('noprogress', False):
667 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at byte resume_len."""
    notice = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(notice)
def report_retry(self, count, retries):
    """Announce a retry after a server-side (HTTP 5xx) error."""
    notice = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(notice)
680 def report_file_already_downloaded(self, file_name):
681 """Report file has already been fully downloaded."""
683 self.to_screen(u'[download] %s has already been downloaded' % file_name)
684 except (UnicodeEncodeError), err:
685 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the partial download is impossible."""
    self.to_screen(u'[download] Unable to resume')
691 def report_finish(self):
692 """Report download finished."""
693 if self.params.get('noprogress', False):
694 self.to_screen(u'[download] Download completed')
698 def increment_downloads(self):
699 """Increment the ordinal that assigns a number to each file."""
700 self._num_downloads += 1
702 def prepare_filename(self, info_dict):
703 """Generate the output filename."""
705 template_dict = dict(info_dict)
706 template_dict['epoch'] = unicode(long(time.time()))
707 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708 filename = self.params['outtmpl'] % template_dict
710 except (ValueError, KeyError), err:
711 self.trouble(u'ERROR: invalid system charset or erroneous output template')
714 def _match_entry(self, info_dict):
715 """ Returns None iff the file should be downloaded """
717 title = info_dict['title']
718 matchtitle = self.params.get('matchtitle', False)
719 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721 rejecttitle = self.params.get('rejecttitle', False)
722 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
726 def process_info(self, info_dict):
727 """Process a single dictionary returned by an InfoExtractor."""
729 reason = self._match_entry(info_dict)
730 if reason is not None:
731 self.to_screen(u'[download] ' + reason)
734 max_downloads = self.params.get('max_downloads')
735 if max_downloads is not None:
736 if self._num_downloads > int(max_downloads):
737 raise MaxDownloadsReached()
739 filename = self.prepare_filename(info_dict)
742 if self.params.get('forcetitle', False):
743 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self.params.get('forceurl', False):
745 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self.params.get('forcedescription', False) and 'description' in info_dict:
749 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self.params.get('forcefilename', False) and filename is not None:
751 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceformat', False):
753 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
755 # Do nothing else if in simulate mode
756 if self.params.get('simulate', False):
762 if self.params.get('nooverwrites', False) and os.path.exists(filename):
763 self.to_stderr(u'WARNING: file exists and will be skipped')
767 dn = os.path.dirname(filename)
768 if dn != '' and not os.path.exists(dn):
770 except (OSError, IOError), err:
771 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
774 if self.params.get('writedescription', False):
776 descfn = filename + '.description'
777 self.report_writedescription(descfn)
778 descfile = open(descfn, 'wb')
780 descfile.write(info_dict['description'].encode('utf-8'))
783 except (OSError, IOError):
784 self.trouble(u'ERROR: Cannot write description file ' + descfn)
787 if self.params.get('writeinfojson', False):
788 infofn = filename + '.info.json'
789 self.report_writeinfojson(infofn)
792 except (NameError,AttributeError):
793 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
796 infof = open(infofn, 'wb')
798 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
799 json.dump(json_info_dict, infof)
802 except (OSError, IOError):
803 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
806 if not self.params.get('skip_download', False):
808 success = self._do_download(filename, info_dict)
809 except (OSError, IOError), err:
810 raise UnavailableVideoError
811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
812 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
814 except (ContentTooShortError, ), err:
815 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
820 self.post_process(filename, info_dict)
821 except (PostProcessingError), err:
822 self.trouble(u'ERROR: postprocessing: %s' % str(err))
825 def download(self, url_list):
826 """Download a given list of URLs."""
827 if len(url_list) > 1 and self.fixed_template():
828 raise SameFileError(self.params['outtmpl'])
831 suitable_found = False
833 # Go to next InfoExtractor if not suitable
834 if not ie.suitable(url):
837 # Suitable InfoExtractor found
838 suitable_found = True
840 # Extract information from URL and process it
843 # Suitable InfoExtractor had been found; go to next URL
846 if not suitable_found:
847 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
849 return self._download_retcode
851 def post_process(self, filename, ie_info):
852 """Run the postprocessing chain on the given file."""
854 info['filepath'] = filename
860 def _download_with_rtmpdump(self, filename, url, player_url):
861 self.report_destination(filename)
862 tmpfilename = self.temp_name(filename)
864 # Check for rtmpdump first
866 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
867 except (OSError, IOError):
868 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
871 # Download using rtmpdump. rtmpdump returns exit code 2 when
872 # the connection was interrumpted and resuming appears to be
873 # possible. This is part of rtmpdump's normal usage, AFAIK.
874 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
875 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
876 while retval == 2 or retval == 1:
877 prevsize = os.path.getsize(tmpfilename)
878 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
879 time.sleep(5.0) # This seems to be needed
880 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
881 cursize = os.path.getsize(tmpfilename)
882 if prevsize == cursize and retval == 1:
884 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
885 if prevsize == cursize and retval == 2 and cursize > 1024:
886 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
890 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
891 self.try_rename(tmpfilename, filename)
894 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
897 def _do_download(self, filename, info_dict):
898 url = info_dict['url']
899 player_url = info_dict.get('player_url', None)
901 # Check file already present
902 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903 self.report_file_already_downloaded(filename)
906 # Attempt to download using rtmpdump
907 if url.startswith('rtmp'):
908 return self._download_with_rtmpdump(filename, url, player_url)
910 tmpfilename = self.temp_name(filename)
913 # Do not include the Accept-Encoding header
914 headers = {'Youtubedl-no-compression': 'True'}
915 basic_request = urllib2.Request(url, None, headers)
916 request = urllib2.Request(url, None, headers)
918 # Establish possible resume length
919 if os.path.isfile(tmpfilename):
920 resume_len = os.path.getsize(tmpfilename)
926 if self.params.get('continuedl', False):
927 self.report_resuming_byte(resume_len)
928 request.add_header('Range','bytes=%d-' % resume_len)
934 retries = self.params.get('retries', 0)
935 while count <= retries:
936 # Establish connection
938 if count == 0 and 'urlhandle' in info_dict:
939 data = info_dict['urlhandle']
940 data = urllib2.urlopen(request)
942 except (urllib2.HTTPError, ), err:
943 if (err.code < 500 or err.code >= 600) and err.code != 416:
944 # Unexpected HTTP error
946 elif err.code == 416:
947 # Unable to resume (requested range not satisfiable)
949 # Open the connection again without the range header
950 data = urllib2.urlopen(basic_request)
951 content_length = data.info()['Content-Length']
952 except (urllib2.HTTPError, ), err:
953 if err.code < 500 or err.code >= 600:
956 # Examine the reported length
957 if (content_length is not None and
958 (resume_len - 100 < long(content_length) < resume_len + 100)):
959 # The file had already been fully downloaded.
960 # Explanation to the above condition: in issue #175 it was revealed that
961 # YouTube sometimes adds or removes a few bytes from the end of the file,
962 # changing the file size slightly and causing problems for some users. So
963 # I decided to implement a suggested change and consider the file
964 # completely downloaded if the file size differs less than 100 bytes from
965 # the one in the hard drive.
966 self.report_file_already_downloaded(filename)
967 self.try_rename(tmpfilename, filename)
970 # The length does not match, we start the download over
971 self.report_unable_to_resume()
977 self.report_retry(count, retries)
980 self.trouble(u'ERROR: giving up after %s retries' % retries)
983 data_len = data.info().get('Content-length', None)
984 if data_len is not None:
985 data_len = long(data_len) + resume_len
986 data_len_str = self.format_bytes(data_len)
987 byte_counter = 0 + resume_len
993 data_block = data.read(block_size)
995 if len(data_block) == 0:
997 byte_counter += len(data_block)
999 # Open file just in time
1002 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003 assert stream is not None
1004 filename = self.undo_temp_name(tmpfilename)
1005 self.report_destination(filename)
1006 except (OSError, IOError), err:
1007 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1010 stream.write(data_block)
1011 except (IOError, OSError), err:
1012 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1014 block_size = self.best_block_size(after - before, len(data_block))
1017 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018 if data_len is None:
1019 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1021 percent_str = self.calc_percent(byte_counter, data_len)
1022 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1026 self.slow_down(start, byte_counter - resume_len)
1029 self.trouble(u'\nERROR: Did not get any data blocks')
1032 self.report_finish()
1033 if data_len is not None and byte_counter != data_len:
1034 raise ContentTooShortError(byte_counter, long(data_len))
1035 self.try_rename(tmpfilename, filename)
1037 # Update file modification time
1038 if self.params.get('updatetime', True):
1039 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1044 class InfoExtractor(object):
1045 """Information Extractor class.
1047 Information extractors are the classes that, given a URL, extract
1048 information from the video (or videos) the URL refers to. This
1049 information includes the real video URL, the video title and simplified
1050 title, author and others. The information is stored in a dictionary
1051 which is then passed to the FileDownloader. The FileDownloader
1052 processes this information possibly downloading the video to the file
1053 system, among other possible outcomes. The dictionaries must include
1054 the following fields:
1056 id: Video identifier.
1057 url: Final video URL.
1058 uploader: Nickname of the video uploader.
1059 title: Literal title.
1060 stitle: Simplified title.
1061 ext: Video filename extension.
1062 format: Video format.
1063 player_url: SWF Player URL (may be None).
1065 The following fields are optional. Their primary purpose is to allow
1066 youtube-dl to serve as the backend for a video search function, such
1067 as the one in youtube2mp3. They are only used when their respective
1068 forced printing functions are called:
1070 thumbnail: Full URL to a video thumbnail image.
1071 description: One-line video description.
1073 Subclasses of this one should re-define the _real_initialize() and
1074 _real_extract() methods and define a _VALID_URL regexp.
1075 Probably, they should also be added to the list of extractors.
1081 def __init__(self, downloader=None):
1082 """Constructor. Receives an optional downloader."""
1084 self.set_downloader(downloader)
1086 def suitable(self, url):
1087 """Receives a URL and returns True if suitable for this IE."""
1088 return re.match(self._VALID_URL, url) is not None
1090 def initialize(self):
1091 """Initializes an instance (authentication, etc)."""
1093 self._real_initialize()
1096 def extract(self, url):
1097 """Extracts URL information and returns it in list of dicts."""
1099 return self._real_extract(url)
1101 def set_downloader(self, downloader):
1102 """Sets the downloader for this IE."""
1103 self._downloader = downloader
1105 def _real_initialize(self):
1106 """Real initialization process. Redefine in subclasses."""
1109 def _real_extract(self, url):
1110 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Accepted URL shapes (watch pages, youtu.be, embeds); the video id is
    # captured by group 2 of this pattern.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Page fetched to force the site language to English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's ~/.netrc for credentials.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # itag -> container extension.
    # NOTE(review): most entries of this dict (and its closing brace) are
    # missing from this copy of the file.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> human-readable resolution.
    # NOTE(review): the entries and closing brace are missing here too.
    _video_dimensions = {
    IE_NAME = u'youtube'
1152 def report_lang(self):
1153 """Report attempt to set language."""
1154 self._downloader.to_screen(u'[youtube] Setting language')
1156 def report_login(self):
1157 """Report attempt to log in."""
1158 self._downloader.to_screen(u'[youtube] Logging in')
1160 def report_age_confirmation(self):
1161 """Report attempt to confirm age."""
1162 self._downloader.to_screen(u'[youtube] Confirming age')
1164 def report_video_webpage_download(self, video_id):
1165 """Report attempt to download video webpage."""
1166 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1168 def report_video_info_webpage_download(self, video_id):
1169 """Report attempt to download video info webpage."""
1170 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1172 def report_information_extraction(self, video_id):
1173 """Report attempt to extract video information."""
1174 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    # (docstring above corrected; it previously read 'Report extracted
    # video URL.' -- a copy-paste error from a sibling method)
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1180 def report_rtmp_download(self):
1181 """Indicate the download will use the RTMP protocol."""
1182 self._downloader.to_screen(u'[youtube] RTMP download detected')
def _print_formats(self, formats):
    """Print each available itag with its extension and dimensions."""
    print 'Available formats:'
    # NOTE(review): the `for x in formats:` loop header is missing from
    # this copy of the file; the line below is its body.
    print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
def _real_initialize(self):
    """Set the site language to English, then log in and confirm age.

    Credentials come from --username/--password or, with --usenetrc,
    from the 'youtube' machine entry in ~/.netrc. All network failures
    are reported as warnings/errors through the downloader.

    NOTE(review): this copy of the file is missing interleaved lines
    (`try:` headers, `return` statements, dict openers/closers), so the
    indentation below is approximate -- restore against pristine source.
    """
    if self._downloader is None:
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        if info is not None:
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
    except (IOError, netrc.NetrcParseError), err:
        self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

    # Force English so date / metadata scraping sees a stable layout.
    request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

    # No authentication to be performed
    if username is None:

    # Log in (form fields below; the dict opener is missing in this copy).
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # A login form in the response means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    # Confirm age (form dict opener missing in this copy).
        'action_confirm': 'Confirm',
    request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract metadata and format URLs from a YouTube watch URL and hand
    each selected format to the downloader via process_info().

    NOTE(review): this copy of the file is missing interleaved lines
    (`try:` headers, `if mobj is None:` guards, `return`/`else` lines and
    several dict entries), so the indentation below is approximate.
    """
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Fetch the watch page (has_verified=1 skips some interstitials).
    self.report_video_webpage_download(video_id)
    request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Un-escape the JS-escaped URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Try several get_video_info variants until one returns a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
        self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Start extracting information
    self.report_information_extraction(video_id)

    # Uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = urllib.unquote_plus(video_info['author'][0])

    # Title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = urllib.unquote_plus(video_info['title'][0])
    video_title = video_title.decode('utf-8')
    video_title = sanitize_title(video_title)
    simple_title = _simplify_title(video_title)

    # Thumbnail (optional)
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        video_thumbnail = ''
    else:	# don't panic if we can't find it
        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

    # Upload date: scraped from the page, tried against several formats.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # Description: only computed when it will actually be used.
    video_description = u'No description available.'
    if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')
        # NOTE(review): the lxml branch below depends on lxml being
        # importable -- confirm how this file guards that import.
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
        # TODO use another parser

    # token
    video_token = urllib.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [parse_qs(uds) for uds in url_data_strs]
        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
        url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

    for format_param, video_real_url in video_url_list:
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id':		video_id.decode('utf-8'),
                'url':		video_real_url.decode('utf-8'),
                'uploader':	video_uploader.decode('utf-8'),
                'upload_date':	upload_date,
                'title':	video_title,
                'stitle':	simple_title,
                'ext':		video_extension.decode('utf-8'),
                'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':	video_thumbnail.decode('utf-8'),
                'description':	video_description,
                'player_url':	player_url,
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # URL shape: group 1 is the video id, group 2 the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page and the endpoint that disables the filter.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Initialize, keeping a YoutubeIE to delegate 'yt-' prefixed ids to."""
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
1445 def report_disclaimer(self):
1446 """Report disclaimer retrieval."""
1447 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1449 def report_age_confirmation(self):
1450 """Report attempt to confirm age."""
1451 self._downloader.to_screen(u'[metacafe] Confirming age')
1453 def report_download_webpage(self, video_id):
1454 """Report webpage download."""
1455 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1457 def report_extraction(self, video_id):
1458 """Report information extraction."""
1459 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
    """Fetch the disclaimer page and POST the form disabling the family filter.

    NOTE(review): this copy is missing the `try:` headers, `return`
    statements and the disclaimer_form dict opener; indentation below is
    approximate.
    """
    # Retrieve disclaimer
    request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

    # Confirm age (form dict opener missing in this copy).
        'submit': "Continue - I'm over 18",
    request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract the media URL, title and uploader from a Metacafe watch page.

    Videos whose id starts with 'yt-' are delegated to the YouTube IE.

    NOTE(review): this copy is missing `try:` headers, `if mobj is None:`
    guards and `return`/`else` lines; indentation below is approximate.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

    # At this point we have a new video
    self._downloader.increment_downloads()

    simple_title = mobj.group(2).decode('utf-8')

    # Retrieve video webpage to extract further information
    request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    # Preferred path: a direct mediaURL parameter in the page.
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    if mobj is not None:
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback path: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = sanitize_title(video_title)

    mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'stitle':	simple_title,
            'ext':		video_extension.decode('utf-8'),
    except UnavailableVideoError:
        self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # URL shape: group 1 is the video id, group 2 the simplified title.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Initialize via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1583 def report_download_webpage(self, video_id):
1584 """Report webpage download."""
1585 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1587 def report_extraction(self, video_id):
1588 """Report information extraction."""
1589 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the SD media URL, title and uploader from a Dailymotion page.

    Sends 'family_filter=off' so age-restricted pages are served.

    NOTE(review): this copy is missing `try:` headers, `if mobj is None:`
    guards and `return` lines; indentation below is approximate.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    # At this point we have a new video
    self._downloader.increment_downloads()
    video_id = mobj.group(1)

    simple_title = mobj.group(2).decode('utf-8')
    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = urllib2.Request(url)
    request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    sequence = urllib.unquote(mobj.group(1))
    mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

    # if needed add http://www.dailymotion.com/ if relative URL

    video_url = mediaURL

    mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = sanitize_title(video_title)

    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'stitle':	simple_title,
            'ext':		video_extension.decode('utf-8'),
    except UnavailableVideoError:
        self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # videoplay URLs across the national Google Video domains; group 1 is docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
def __init__(self, downloader=None):
    """Initialize via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1671 def report_download_webpage(self, video_id):
1672 """Report webpage download."""
1673 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1675 def report_extraction(self, video_id):
1676 """Report information extraction."""
1677 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract media URL, title and description from a Google Video page.

    NOTE(review): this copy is missing `try:` headers, `if mobj is None:`
    guards, `return`/`else` lines and some process_info entries;
    indentation below is approximate.
    """
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    # At this point we have a new video
    self._downloader.increment_downloads()
    video_id = mobj.group(1)

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    # Preferred: direct mp4 download URL; otherwise fall back to the
    # escaped flv videoUrl parameter.
    mobj = re.search(r"download_url:'([^']+)'", webpage)
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = urllib.unquote(mobj.group(1))
    # Un-escape the \xNN sequences Google embeds in the page.
    mediaURL = mediaURL.replace('\\x3d', '\x3d')
    mediaURL = mediaURL.replace('\\x26', '\x26')

    video_url = mediaURL

    mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = sanitize_title(video_title)
    simple_title = _simplify_title(video_title)

    # Extract video description
    mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video thumbnail
    if self._downloader.params.get('forcethumbnail', False):
        request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1)
    else:	# we need something to pass to process_info
        video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'url':		video_url.decode('utf-8'),
            'upload_date':	u'NA',
            'title':	video_title,
            'stitle':	simple_title,
            'ext':		video_extension.decode('utf-8'),
    except UnavailableVideoError:
        self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # group 1 captures the .flv filename from the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Initialize via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1775 def report_download_webpage(self, video_id):
1776 """Report webpage download."""
1777 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1779 def report_extraction(self, video_id):
1780 """Report information extraction."""
1781 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1783 def _real_extract(self, url):
1784 # Extract id from URL
1785 mobj = re.match(self._VALID_URL, url)
1787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1790 # At this point we have a new video
1791 self._downloader.increment_downloads()
1792 video_id = mobj.group(1)
1794 video_extension = 'flv'
1796 # Retrieve video webpage to extract further information
1797 request = urllib2.Request(url)
1799 self.report_download_webpage(video_id)
1800 webpage = urllib2.urlopen(request).read()
1801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1805 # Extract URL, uploader, and title from webpage
1806 self.report_extraction(video_id)
1807 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1809 self._downloader.trouble(u'ERROR: unable to extract media URL')
1811 mediaURL = urllib.unquote(mobj.group(1))
1813 video_url = mediaURL
1815 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1817 self._downloader.trouble(u'ERROR: unable to extract title')
1819 video_title = mobj.group(1).decode('utf-8')
1820 video_title = sanitize_title(video_title)
1821 simple_title = _simplify_title(vide_title)
1823 video_uploader = mobj.group(2).decode('utf-8')
1826 # Process video information
1827 self._downloader.process_info({
1828 'id': video_id.decode('utf-8'),
1829 'url': video_url.decode('utf-8'),
1830 'uploader': video_uploader,
1831 'upload_date': u'NA',
1832 'title': video_title,
1833 'stitle': simple_title,
1834 'ext': video_extension.decode('utf-8'),
1838 except UnavailableVideoError:
1839 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Initialize via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
1854 def report_download_webpage(self, video_id):
1855 """Report webpage download."""
1856 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1858 def report_extraction(self, video_id):
1859 """Report information extraction."""
1860 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
    """Extract media URL and metadata from a Yahoo! Video page.

    Non-/watch/ URLs are rewritten to the canonical /watch/ form and this
    method recurses once with new_video=False.

    NOTE(review): this copy is missing `try:` headers, `if mobj is None:`
    guards and `return` lines; indentation below is approximate.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    # At this point we have a new video
    self._downloader.increment_downloads()
    video_id = mobj.group(2)
    video_extension = 'flv'

    # Rewrite valid but non-extractable URLs as
    # extractable English language /watch/ URLs
    if re.match(self._VPAGE_URL, url) is None:
        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
        yahoo_id = mobj.group(1)

        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
        yahoo_vid = mobj.group(1)

        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
        return self._real_extract(url, new_video=False)

    # Retrieve video webpage to extract further information
    request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

    # Extract uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')
    simple_title = _simplify_title(video_title)

    mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
    # NOTE(review): group(1) captures 'people' or 'profile', not the
    # uploader name -- group(2) looks like the intended group. Verify.
    video_uploader = mobj.group(1).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
    video_thumbnail = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video height and width
    mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
    yv_video_height = mobj.group(1)

    mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
    yv_video_width = mobj.group(1)

    # Retrieve video playlist to extract media URL
    # I'm not completely sure what all these options are, but we
    # seem to need most of them, otherwise the server sends a 401.
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
    yv_bitrate = '700' # according to Wikipedia this is hard-coded
    request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

    # Extract media URL from playlist XML
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
    video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        # NOTE(review): 'thumbnail' appears twice in this dict; the second
        # (undecoded) entry silently wins. Remove one of them.
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'uploader':	video_uploader,
            'upload_date':	u'NA',
            'title':	video_title,
            'stitle':	simple_title,
            'ext':		video_extension.decode('utf-8'),
            'thumbnail':	video_thumbnail.decode('utf-8'),
            'description':	video_description,
            'thumbnail':	video_thumbnail,
    except UnavailableVideoError:
        self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; group 1 is the numeric clip id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    """Initialize via the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)
2004 def report_download_webpage(self, video_id):
2005 """Report webpage download."""
2006 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2008 def report_extraction(self, video_id):
2009 """Report information extraction."""
2010 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
    """Extract a playable moogaloop URL and metadata from a Vimeo clip id.

    NOTE(review): this copy is missing `try:` headers, `if mobj is None:`
    guards and `return`/`else` lines; indentation below is approximate.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    # At this point we have a new video
    self._downloader.increment_downloads()
    video_id = mobj.group(1)

    # Retrieve video webpage to extract further information
    request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

    # Now we begin extracting as much information as we can from what we
    # retrieved. First we extract the information common to all extractors,
    # and latter we extract those that are Vimeo specific.
    self.report_extraction(video_id)

    mobj = re.search(r'<caption>(.*?)</caption>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')
    simple_title = _simplify_title(video_title)

    mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
    video_uploader = mobj.group(1).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
    video_thumbnail = mobj.group(1).decode('utf-8')

    # # Extract video description
    # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
    # 	self._downloader.trouble(u'ERROR: unable to extract video description')
    # video_description = mobj.group(1).decode('utf-8')
    # if not video_description: video_description = 'No description available.'
    # NOTE(review): 'Foo.' is a debugging placeholder left in the code;
    # the commented-out block above is the real description extraction.
    video_description = 'Foo.'

    # Vimeo specific: extract request signature
    mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract request signature')
    sig = mobj.group(1).decode('utf-8')

    # Vimeo specific: extract video quality information
    mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video quality information')
    quality = mobj.group(1).decode('utf-8')

    if int(quality) == 1:

    # Vimeo specific: Extract request signature expiration
    mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
    sig_exp = mobj.group(1).decode('utf-8')

    video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)

        # Process video information
        # NOTE(review): both 'thumbnail' and 'description' appear twice in
        # this dict; the later entries silently win. Remove the duplicates.
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'uploader':	video_uploader,
            'upload_date':	u'NA',
            'title':	video_title,
            'stitle':	simple_title,
            'thumbnail':	video_thumbnail.decode('utf-8'),
            'description':	video_description,
            'thumbnail':	video_thumbnail,
            'description':	video_description,
    except UnavailableVideoError:
        self._downloader.trouble(u'ERROR: unable to download video')
# GenericIE: last-resort extractor. Downloads the page at the given URL and
# scrapes for a direct media URL (JW Player flashvars, then a broader
# file=/source= pattern); title comes from <title>, uploader from the domain.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `if mobj is None:`, `try:`, `return` guard lines are absent from this excerpt.
2116 class GenericIE(InfoExtractor):
2117 """Generic last-resort information extractor."""
2120 IE_NAME = u'generic'
2122 def __init__(self, downloader=None):
2123 InfoExtractor.__init__(self, downloader)
2125 def report_download_webpage(self, video_id):
2126 """Report webpage download."""
2127 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2128 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2130 def report_extraction(self, video_id):
2131 """Report information extraction."""
2132 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2134 def _real_extract(self, url):
2135 # At this point we have a new video
2136 self._downloader.increment_downloads()
2138 video_id = url.split('/')[-1]
2139 request = urllib2.Request(url)
2141 self.report_download_webpage(video_id)
2142 webpage = urllib2.urlopen(request).read()
2143 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2144 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2146 except ValueError, err:
2147 # since this is the last-resort InfoExtractor, if
2148 # this error is thrown, it'll be thrown here
2149 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2152 self.report_extraction(video_id)
2153 # Start with something easy: JW Player in SWFObject
2154 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2156 # Broaden the search a little bit
2157 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2159 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2162 # It's possible that one of the regexes
2163 # matched, but returned an empty group:
2164 if mobj.group(1) is None:
2165 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2168 video_url = urllib.unquote(mobj.group(1))
2169 video_id = os.path.basename(video_url)
2171 # here's a fun little line of code for you:
2172 video_extension = os.path.splitext(video_id)[1][1:]
2173 video_id = os.path.splitext(video_id)[0]
2175 # it's tempting to parse this further, but you would
2176 # have to take into account all the variations like
2177 # Video Title - Site Name
2178 # Site Name | Video Title
2179 # Video Title - Tagline | Site Name
2180 # and so on and so forth; it's just not practical
2181 mobj = re.search(r'<title>(.*)</title>', webpage)
2183 self._downloader.trouble(u'ERROR: unable to extract title')
2185 video_title = mobj.group(1).decode('utf-8')
2186 video_title = sanitize_title(video_title)
2187 simple_title = _simplify_title(video_title)
2189 # video uploader is domain name
2190 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message below says 'unable to extract title' but this
# branch is extracting the uploader (domain) -- looks copy-pasted.
2192 self._downloader.trouble(u'ERROR: unable to extract title')
2194 video_uploader = mobj.group(1).decode('utf-8')
2197 # Process video information
2198 self._downloader.process_info({
2199 'id': video_id.decode('utf-8'),
2200 'url': video_url.decode('utf-8'),
2201 'uploader': video_uploader,
2202 'upload_date': u'NA',
2203 'title': video_title,
2204 'stitle': simple_title,
2205 'ext': video_extension.decode('utf-8'),
2209 except UnavailableVideoError, err:
2210 self._downloader.trouble(u'\nERROR: unable to download video')
# YoutubeSearchIE: handles 'ytsearchN:QUERY' / 'ytsearchall:QUERY' pseudo-URLs.
# Parses the requested result count, pages through YouTube's search results,
# collects video ids, and delegates each one to the wrapped YoutubeIE.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# guard lines (`if mobj is None:`, `try:`, `return`, loop headers,
# `video_ids = []` init) are absent from this excerpt.
2213 class YoutubeSearchIE(InfoExtractor):
2214 """Information Extractor for YouTube search queries."""
2215 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2216 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2217 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2218 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2220 _max_youtube_results = 1000
2221 IE_NAME = u'youtube:search'
2223 def __init__(self, youtube_ie, downloader=None):
2224 InfoExtractor.__init__(self, downloader)
2225 self._youtube_ie = youtube_ie
2227 def report_download_page(self, query, pagenum):
2228 """Report attempt to download playlist page with given number."""
2229 query = query.decode(preferredencoding())
2230 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2232 def _real_initialize(self):
2233 self._youtube_ie.initialize()
2235 def _real_extract(self, query):
2236 mobj = re.match(self._VALID_URL, query)
2238 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# split(':') on the first colon separates 'ytsearchN' from the query text
2241 prefix, query = query.split(':')
2243 query = query.encode('utf-8')
2245 self._download_n_results(query, 1)
2247 elif prefix == 'all':
2248 self._download_n_results(query, self._max_youtube_results)
2254 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2256 elif n > self._max_youtube_results:
2257 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2258 n = self._max_youtube_results
2259 self._download_n_results(query, n)
2261 except ValueError: # parsing prefix as integer fails
2262 self._download_n_results(query, 1)
2265 def _download_n_results(self, query, n):
2266 """Downloads a specified number of results for a query"""
2269 already_seen = set()
2273 self.report_download_page(query, pagenum)
2274 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2275 request = urllib2.Request(result_url)
2277 page = urllib2.urlopen(request).read()
2278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2279 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2282 # Extract video identifiers
2283 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# slice the matched href, split on '=', take the id and drop the closing quote
2284 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2285 if video_id not in already_seen:
2286 video_ids.append(video_id)
2287 already_seen.add(video_id)
2288 if len(video_ids) == n:
2289 # Specified n videos reached
2290 for id in video_ids:
2291 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# no "Next" link -> last page: flush everything collected so far
2294 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2295 for id in video_ids:
2296 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2299 pagenum = pagenum + 1
# GoogleSearchIE: handles 'gvsearchN:QUERY' / 'gvsearchall:QUERY' pseudo-URLs
# for Google Video search, mirroring YoutubeSearchIE's structure; each found
# docid is delegated to the wrapped GoogleIE.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# guard/`try:`/`return`/loop-init lines are absent from this excerpt.
2302 class GoogleSearchIE(InfoExtractor):
2303 """Information Extractor for Google Video search queries."""
2304 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2305 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2306 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2307 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2309 _max_google_results = 1000
2310 IE_NAME = u'video.google:search'
2312 def __init__(self, google_ie, downloader=None):
2313 InfoExtractor.__init__(self, downloader)
2314 self._google_ie = google_ie
2316 def report_download_page(self, query, pagenum):
2317 """Report attempt to download playlist page with given number."""
2318 query = query.decode(preferredencoding())
2319 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2321 def _real_initialize(self):
2322 self._google_ie.initialize()
2324 def _real_extract(self, query):
2325 mobj = re.match(self._VALID_URL, query)
2327 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2330 prefix, query = query.split(':')
2332 query = query.encode('utf-8')
2334 self._download_n_results(query, 1)
2336 elif prefix == 'all':
2337 self._download_n_results(query, self._max_google_results)
2343 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2345 elif n > self._max_google_results:
2346 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2347 n = self._max_google_results
2348 self._download_n_results(query, n)
2350 except ValueError: # parsing prefix as integer fails
2351 self._download_n_results(query, 1)
2354 def _download_n_results(self, query, n):
2355 """Downloads a specified number of results for a query"""
2358 already_seen = set()
2362 self.report_download_page(query, pagenum)
2363 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2364 request = urllib2.Request(result_url)
2366 page = urllib2.urlopen(request).read()
2367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2368 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2371 # Extract video identifiers
2372 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2373 video_id = mobj.group(1)
2374 if video_id not in already_seen:
2375 video_ids.append(video_id)
2376 already_seen.add(video_id)
2377 if len(video_ids) == n:
2378 # Specified n videos reached
2379 for id in video_ids:
2380 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# no "Next" span -> last results page: flush collected ids
2383 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2384 for id in video_ids:
2385 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2388 pagenum = pagenum + 1
# YahooSearchIE: handles 'yvsearchN:QUERY' / 'yvsearchall:QUERY' pseudo-URLs
# for Yahoo! Video search, same shape as the YouTube/Google search extractors;
# delegates each found watch-id to the wrapped YahooIE.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# guard/`try:`/`return`/loop-init lines are absent from this excerpt.
2391 class YahooSearchIE(InfoExtractor):
2392 """Information Extractor for Yahoo! Video search queries."""
2393 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2394 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2395 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2396 _MORE_PAGES_INDICATOR = r'\s*Next'
2398 _max_yahoo_results = 1000
2399 IE_NAME = u'video.yahoo:search'
2401 def __init__(self, yahoo_ie, downloader=None):
2402 InfoExtractor.__init__(self, downloader)
2403 self._yahoo_ie = yahoo_ie
2405 def report_download_page(self, query, pagenum):
2406 """Report attempt to download playlist page with given number."""
2407 query = query.decode(preferredencoding())
2408 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2410 def _real_initialize(self):
2411 self._yahoo_ie.initialize()
2413 def _real_extract(self, query):
2414 mobj = re.match(self._VALID_URL, query)
2416 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2419 prefix, query = query.split(':')
2421 query = query.encode('utf-8')
2423 self._download_n_results(query, 1)
2425 elif prefix == 'all':
2426 self._download_n_results(query, self._max_yahoo_results)
2432 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2434 elif n > self._max_yahoo_results:
2435 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2436 n = self._max_yahoo_results
2437 self._download_n_results(query, n)
2439 except ValueError: # parsing prefix as integer fails
2440 self._download_n_results(query, 1)
2443 def _download_n_results(self, query, n):
2444 """Downloads a specified number of results for a query"""
2447 already_seen = set()
2451 self.report_download_page(query, pagenum)
2452 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2453 request = urllib2.Request(result_url)
2455 page = urllib2.urlopen(request).read()
2456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2460 # Extract video identifiers
2461 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2462 video_id = mobj.group(1)
2463 if video_id not in already_seen:
2464 video_ids.append(video_id)
2465 already_seen.add(video_id)
2466 if len(video_ids) == n:
2467 # Specified n videos reached
2468 for id in video_ids:
2469 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# no "Next" link -> last results page: flush collected ids
2472 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2473 for id in video_ids:
2474 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2477 pagenum = pagenum + 1
# YoutubePlaylistIE: walks a YouTube playlist/artist/course page by page,
# collects per-page watch ids, applies --playlist-start/--playlist-end, and
# delegates each video to the wrapped YoutubeIE.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `if mobj is None:`/`try:`/`return`/`else:`/`pagenum = 1` lines are absent
# from this excerpt.
2480 class YoutubePlaylistIE(InfoExtractor):
2481 """Information Extractor for YouTube playlists."""
2483 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2484 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2485 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2486 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2488 IE_NAME = u'youtube:playlist'
2490 def __init__(self, youtube_ie, downloader=None):
2491 InfoExtractor.__init__(self, downloader)
2492 self._youtube_ie = youtube_ie
2494 def report_download_page(self, playlist_id, pagenum):
2495 """Report attempt to download playlist page with given number."""
2496 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2498 def _real_initialize(self):
2499 self._youtube_ie.initialize()
2501 def _real_extract(self, url):
2502 # Extract playlist id
2503 mobj = re.match(self._VALID_URL, url)
2505 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# group(3) present -> URL points at a single video inside the playlist;
# hand it straight to YoutubeIE instead of walking the playlist
2509 if mobj.group(3) is not None:
2510 self._youtube_ie.extract(mobj.group(3))
2513 # Download playlist pages
2514 # prefix is 'p' as default for playlists but there are other types that need extra care
2515 playlist_prefix = mobj.group(1)
2516 if playlist_prefix == 'a':
2517 playlist_access = 'artist'
2519 playlist_prefix = 'p'
2520 playlist_access = 'view_play_list'
2521 playlist_id = mobj.group(2)
2526 self.report_download_page(playlist_id, pagenum)
2527 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2528 request = urllib2.Request(url)
2530 page = urllib2.urlopen(request).read()
2531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2535 # Extract video identifiers
2537 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2538 if mobj.group(1) not in ids_in_page:
2539 ids_in_page.append(mobj.group(1))
2540 video_ids.extend(ids_in_page)
2542 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2544 pagenum = pagenum + 1
# apply --playlist-start/--playlist-end (params are 1-based; start shifted)
2546 playliststart = self._downloader.params.get('playliststart', 1) - 1
2547 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the
# LAST video ([start:-1]); YoutubeUserIE below special-cases -1 -- confirm
# whether the same guard is needed here.
2548 video_ids = video_ids[playliststart:playlistend]
2550 for id in video_ids:
2551 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# YoutubeUserIE: enumerates all uploads of a YouTube user via the GData API,
# fetching pages of _GDATA_PAGE_SIZE ids until a short page signals the end,
# then applies playlist start/end bounds and delegates to YoutubeIE.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `if mobj is None:`/`try:`/`return`/`else:`/loop-init lines are absent from
# this excerpt.
2555 class YoutubeUserIE(InfoExtractor):
2556 """Information Extractor for YouTube users."""
2558 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2559 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2560 _GDATA_PAGE_SIZE = 50
2561 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2562 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2564 IE_NAME = u'youtube:user'
2566 def __init__(self, youtube_ie, downloader=None):
2567 InfoExtractor.__init__(self, downloader)
2568 self._youtube_ie = youtube_ie
2570 def report_download_page(self, username, start_index):
2571 """Report attempt to download user page."""
2572 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2573 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2575 def _real_initialize(self):
2576 self._youtube_ie.initialize()
2578 def _real_extract(self, url):
2580 mobj = re.match(self._VALID_URL, url)
2582 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2585 username = mobj.group(1)
2587 # Download video ids using YouTube Data API. Result size per
2588 # query is limited (currently to 50 videos) so we need to query
2589 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1
2596 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2597 self.report_download_page(username, start_index)
2599 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2602 page = urllib2.urlopen(request).read()
2603 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2604 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2607 # Extract video identifiers
2610 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2611 if mobj.group(1) not in ids_in_page:
2612 ids_in_page.append(mobj.group(1))
2614 video_ids.extend(ids_in_page)
2616 # A little optimization - if current page is not
2617 # "full", ie. does not contain PAGE_SIZE video ids then
2618 # we can assume that this page is the last one - there
2619 # are no more ids on further pages - no need to query
2622 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2627 all_ids_count = len(video_ids)
2628 playliststart = self._downloader.params.get('playliststart', 1) - 1
2629 playlistend = self._downloader.params.get('playlistend', -1)
# special-case -1 so the default slice keeps the last upload
2631 if playlistend == -1:
2632 video_ids = video_ids[playliststart:]
2634 video_ids = video_ids[playliststart:playlistend]
2636 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2637 (username, all_ids_count, len(video_ids)))
2639 for video_id in video_ids:
2640 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# DepositFilesIE: file-hosting extractor. POSTs the 'Free download' form to
# the English-locale page, scrapes the fileshare form action for the real
# file URL (or a human-readable restriction message on failure), then the
# file title from a <b title="..."> tag.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `try:`/`return`/`if mobj is None:` lines are absent from this excerpt.
2643 class DepositFilesIE(InfoExtractor):
2644 """Information extractor for depositfiles.com"""
2646 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2647 IE_NAME = u'DepositFiles'
2649 def __init__(self, downloader=None):
2650 InfoExtractor.__init__(self, downloader)
2652 def report_download_webpage(self, file_id):
2653 """Report webpage download."""
2654 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2656 def report_extraction(self, file_id):
2657 """Report information extraction."""
2658 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2660 def _real_extract(self, url):
2661 # At this point we have a new file
2662 self._downloader.increment_downloads()
2664 file_id = url.split('/')[-1]
2665 # Rebuild url in english locale
2666 url = 'http://depositfiles.com/en/files/' + file_id
2668 # Retrieve file webpage with 'Free download' button pressed
2669 free_download_indication = { 'gateway_result' : '1' }
# passing POST data makes this a POST request (urllib2 behavior)
2670 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2672 self.report_download_webpage(file_id)
2673 webpage = urllib2.urlopen(request).read()
2674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2675 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2678 # Search for the real file URL
2679 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2680 if (mobj is None) or (mobj.group(1) is None):
2681 # Try to figure out reason of the error.
2682 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2683 if (mobj is not None) and (mobj.group(1) is not None):
# collapse whitespace in the site's restriction notice before reporting
2684 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2685 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2687 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2690 file_url = mobj.group(1)
2691 file_extension = os.path.splitext(file_url)[1][1:]
2693 # Search for file title
2694 mobj = re.search(r'<b title="(.*?)">', webpage)
2696 self._downloader.trouble(u'ERROR: unable to extract title')
2698 file_title = mobj.group(1).decode('utf-8')
2701 # Process file information
2702 self._downloader.process_info({
2703 'id': file_id.decode('utf-8'),
2704 'url': file_url.decode('utf-8'),
2706 'upload_date': u'NA',
2707 'title': file_title,
2708 'stitle': file_title,
2709 'ext': file_extension.decode('utf-8'),
2713 except UnavailableVideoError, err:
2714 self._downloader.trouble(u'ERROR: unable to download file')
# FacebookIE: logs in (credentials from options or .netrc), downloads the
# video page, scrapes title/owner/thumbnail and per-format source URLs from
# inline Javascript, applies format selection (-f / --max-quality), and
# hands each chosen format to the downloader.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `try:`/`return`/`else:` lines, the _video_extensions dict body, the
# login_form construction, and several guards are absent from this excerpt.
2717 class FacebookIE(InfoExtractor):
2718 """Information Extractor for Facebook"""
2720 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2721 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2722 _NETRC_MACHINE = 'facebook'
2723 _available_formats = ['video', 'highqual', 'lowqual']
2724 _video_extensions = {
2729 IE_NAME = u'facebook'
2731 def __init__(self, downloader=None):
2732 InfoExtractor.__init__(self, downloader)
2734 def _reporter(self, message):
2735 """Add header and report message."""
2736 self._downloader.to_screen(u'[facebook] %s' % message)
2738 def report_login(self):
2739 """Report attempt to log in."""
2740 self._reporter(u'Logging in')
2742 def report_video_webpage_download(self, video_id):
2743 """Report attempt to download video webpage."""
2744 self._reporter(u'%s: Downloading video webpage' % video_id)
2746 def report_information_extraction(self, video_id):
2747 """Report attempt to extract video information."""
2748 self._reporter(u'%s: Extracting video information' % video_id)
2750 def _parse_page(self, video_webpage):
2751 """Extract video information from page"""
# map of metadata field -> regex that captures it from inline JS/HTML
2753 data = {'title': r'\("video_title", "(.*?)"\)',
2754 'description': r'<div class="datawrap">(.*?)</div>',
2755 'owner': r'\("video_owner_name", "(.*?)"\)',
2756 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2759 for piece in data.keys():
2760 mobj = re.search(data[piece], video_webpage)
2761 if mobj is not None:
2762 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# per-format source URLs, e.g. ("video_src", "...")
2766 for fmt in self._available_formats:
2767 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2768 if mobj is not None:
2769 # URL is in a Javascript segment inside an escaped Unicode format within
2770 # the generally utf-8 page
2771 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2772 video_info['video_urls'] = video_urls
2776 def _real_initialize(self):
2777 if self._downloader is None:
2782 downloader_params = self._downloader.params
2784 # Attempt to use provided username and password or .netrc data
2785 if downloader_params.get('username', None) is not None:
2786 useremail = downloader_params['username']
2787 password = downloader_params['password']
2788 elif downloader_params.get('usenetrc', False):
2790 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2791 if info is not None:
2795 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2796 except (IOError, netrc.NetrcParseError), err:
2797 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# no credentials at all -> login is skipped (Facebook may still serve public videos)
2800 if useremail is None:
2809 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2812 login_results = urllib2.urlopen(request).read()
# a login <form> still present in the response means the login failed
2813 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2814 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2816 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2817 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2820 def _real_extract(self, url):
2821 mobj = re.match(self._VALID_URL, url)
2823 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2825 video_id = mobj.group('ID')
2828 self.report_video_webpage_download(video_id)
2829 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2831 page = urllib2.urlopen(request)
2832 video_webpage = page.read()
2833 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2834 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2837 # Start extracting information
2838 self.report_information_extraction(video_id)
2840 # Extract information
2841 video_info = self._parse_page(video_webpage)
2844 if 'owner' not in video_info:
2845 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2847 video_uploader = video_info['owner']
2850 if 'title' not in video_info:
2851 self._downloader.trouble(u'ERROR: unable to extract video title')
2853 video_title = video_info['title']
2854 video_title = video_title.decode('utf-8')
2855 video_title = sanitize_title(video_title)
2857 simple_title = _simplify_title(video_title)
# thumbnail is optional: warn and fall back to an empty string
2860 if 'thumbnail' not in video_info:
2861 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2862 video_thumbnail = ''
2864 video_thumbnail = video_info['thumbnail']
2868 if 'upload_date' in video_info:
2869 upload_time = video_info['upload_date']
2870 timetuple = email.utils.parsedate_tz(upload_time)
2871 if timetuple is not None:
# parsedate_tz returns a 10-tuple; strftime needs only the 9 time fields
2873 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2878 video_description = video_info.get('description', 'No description available.')
2880 url_map = video_info['video_urls']
2881 if len(url_map.keys()) > 0:
2882 # Decide which formats to download
2883 req_format = self._downloader.params.get('format', None)
2884 format_limit = self._downloader.params.get('format_limit', None)
# restrict candidates to formats at or below the --max-quality limit
2886 if format_limit is not None and format_limit in self._available_formats:
2887 format_list = self._available_formats[self._available_formats.index(format_limit):]
2889 format_list = self._available_formats
2890 existing_formats = [x for x in format_list if x in url_map]
2891 if len(existing_formats) == 0:
2892 self._downloader.trouble(u'ERROR: no known formats available for video')
2894 if req_format is None:
2895 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2896 elif req_format == 'worst':
2897 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2898 elif req_format == '-1':
2899 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2902 if req_format not in url_map:
2903 self._downloader.trouble(u'ERROR: requested format not available')
2905 video_url_list = [(req_format, url_map[req_format])] # Specific format
2907 for format_param, video_real_url in video_url_list:
2909 # At this point we have a new video
2910 self._downloader.increment_downloads()
2913 video_extension = self._video_extensions.get(format_param, 'mp4')
2916 # Process video information
2917 self._downloader.process_info({
2918 'id': video_id.decode('utf-8'),
2919 'url': video_real_url.decode('utf-8'),
2920 'uploader': video_uploader.decode('utf-8'),
2921 'upload_date': upload_date,
2922 'title': video_title,
2923 'stitle': simple_title,
2924 'ext': video_extension.decode('utf-8'),
2925 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2926 'thumbnail': video_thumbnail.decode('utf-8'),
2927 'description': video_description.decode('utf-8'),
2930 except UnavailableVideoError, err:
2931 self._downloader.trouble(u'\nERROR: unable to download video')
# BlipTVIE: asks blip.tv for the JSON ("skin=json") description of a URL.
# If the server answers with a video/* Content-Type it is a direct media
# link and minimal info is synthesized from the URL; otherwise the JSON
# 'Post' payload supplies url/title/uploader/date/format metadata.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `try:`/`return`/`else:` lines, the cchar choice, and the two info-dict
# openers are absent from this excerpt.
2933 class BlipTVIE(InfoExtractor):
2934 """Information extractor for blip.tv"""
2936 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2937 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2938 IE_NAME = u'blip.tv'
2940 def report_extraction(self, file_id):
2941 """Report information extraction."""
2942 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2944 def report_direct_download(self, title):
2945 """Report information extraction."""
2946 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2948 def _real_extract(self, url):
2949 mobj = re.match(self._VALID_URL, url)
2951 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen before this excerpt resumes
2958 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2959 request = urllib2.Request(json_url)
2960 self.report_extraction(mobj.group(1))
2963 urlh = urllib2.urlopen(request)
2964 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2965 basename = url.split('/')[-1]
2966 title,ext = os.path.splitext(basename)
2967 title = title.decode('UTF-8')
2968 ext = ext.replace('.', '')
2969 self.report_direct_download(title)
2974 'stitle': _simplify_title(title),
2978 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2979 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2981 if info is None: # Regular URL
2983 json_code = urlh.read()
2984 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2985 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2989 json_data = json.loads(json_code)
2990 if 'Post' in json_data:
2991 data = json_data['Post']
# blip.tv datestamps look like '11-28-10 12:30PM'; normalize to YYYYMMDD
2995 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2996 video_url = data['media']['url']
2997 umobj = re.match(self._URL_EXT, video_url)
2999 raise ValueError('Can not determine filename extension')
3000 ext = umobj.group(1)
3003 'id': data['item_id'],
3005 'uploader': data['display_name'],
3006 'upload_date': upload_date,
3007 'title': data['title'],
3008 'stitle': _simplify_title(data['title']),
3010 'format': data['media']['mimeType'],
3011 'thumbnail': data['thumbnailUrl'],
3012 'description': data['description'],
3013 'player_url': data['embedUrl']
3015 except (ValueError,KeyError), err:
3016 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3019 self._downloader.increment_downloads()
3022 self._downloader.process_info(info)
3023 except UnavailableVideoError, err:
3024 self._downloader.trouble(u'\nERROR: unable to download video')
# MyVideoIE: myvideo.de extractor. Derives the FLV URL from the thumbnail
# server path found in the page's image_src <link>, and the title from
# <title>.
# NOTE(review): sampled listing -- leading numbers are original line numbers;
# `if mobj is None:`/`try:`/`return` lines and part of the process_info dict
# are absent from this excerpt.
3027 class MyVideoIE(InfoExtractor):
3028 """Information Extractor for myvideo.de."""
3030 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3031 IE_NAME = u'myvideo'
3033 def __init__(self, downloader=None):
3034 InfoExtractor.__init__(self, downloader)
3036 def report_download_webpage(self, video_id):
3037 """Report webpage download."""
3038 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3040 def report_extraction(self, video_id):
3041 """Report information extraction."""
3042 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3044 def _real_extract(self,url):
3045 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `self._download` is a typo -- every other extractor in this
# file calls `self._downloader.trouble(...)`; as written this line would
# raise AttributeError when an invalid URL is reported.
3047 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3050 video_id = mobj.group(1)
3053 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3055 self.report_download_webpage(video_id)
3056 webpage = urllib2.urlopen(request).read()
3057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3058 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3061 self.report_extraction(video_id)
# the thumbnail URL's movie path doubles as the base of the .flv media URL
3062 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3065 self._downloader.trouble(u'ERROR: unable to extract media URL')
3067 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3069 mobj = re.search('<title>([^<]+)</title>', webpage)
3071 self._downloader.trouble(u'ERROR: unable to extract title')
3074 video_title = mobj.group(1)
3075 video_title = sanitize_title(video_title)
3077 simple_title = _simplify_title(video_title)
3080 self._downloader.process_info({
3084 'upload_date': u'NA',
3085 'title': video_title,
3086 'stitle': simple_title,
3091 except UnavailableVideoError:
3092 self._downloader.trouble(u'\nERROR: Unable to download video')
# ComedyCentralIE: extracts Daily Show / Colbert Report full episodes.
# Flow (as visible): resolve shortname -> show URL, follow the redirect,
# scrape the mtvnservices Flash <param>, download the MRSS index, then one
# mediaGen config XML per <item>, and pick a rendition per item.
# NOTE(review): sampled listing — original line numbers skip; the missing
# 'if mobj is None:' / 'try:' / 'return' lines fall in the gaps.
3094 class ComedyCentralIE(InfoExtractor):
3095 """Information extractor for The Daily Show and Colbert Report """
3097 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3098 IE_NAME = u'comedycentral'
3100 def report_extraction(self, episode_id):
3101 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3103 def report_config_download(self, episode_id):
3104 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3106 def report_index_download(self, episode_id):
3107 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3109 def report_player_url(self, episode_id):
3110 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3112 def _real_extract(self, url):
3113 mobj = re.match(self._VALID_URL, url)
3115 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames like ':tds' are rewritten to the show's full-episodes page.
3118 if mobj.group('shortname'):
3119 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3120 url = u'http://www.thedailyshow.com/full-episodes/'
3122 url = u'http://www.colbertnation.com/full-episodes/'
3123 mobj = re.match(self._VALID_URL, url)
3124 assert mobj is not None
3126 dlNewest = not mobj.group('episode')
3128 epTitle = mobj.group('showname')
3130 epTitle = mobj.group('episode')
3132 req = urllib2.Request(url)
3133 self.report_extraction(epTitle)
3135 htmlHandle = urllib2.urlopen(req)
3136 html = htmlHandle.read()
3137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3138 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The site redirects the bare full-episodes URL to a specific episode;
# geturl() recovers the final episode URL.
3141 url = htmlHandle.geturl()
3142 mobj = re.match(self._VALID_URL, url)
3144 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3146 if mobj.group('episode') == '':
3147 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3149 epTitle = mobj.group('episode')
3151 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3152 if len(mMovieParams) == 0:
3153 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3156 playerUrl_raw = mMovieParams[0][0]
3157 self.report_player_url(epTitle)
3159 urlHandle = urllib2.urlopen(playerUrl_raw)
3160 playerUrl = urlHandle.geturl()
3161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3162 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3165 uri = mMovieParams[0][1]
3166 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3167 self.report_index_download(epTitle)
3169 indexXml = urllib2.urlopen(indexUrl).read()
3170 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3171 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3174 idoc = xml.etree.ElementTree.fromstring(indexXml)
3175 itemEls = idoc.findall('.//item')
3176 for itemEl in itemEls:
# guid is colon-separated; last segment is the media id, second-to-last
# (minus '.com') is used as the show id prefix for the title.
3177 mediaId = itemEl.findall('./guid')[0].text
3178 shortMediaId = mediaId.split(':')[-1]
3179 showId = mediaId.split(':')[-2].replace('.com', '')
3180 officialTitle = itemEl.findall('./title')[0].text
3181 officialDate = itemEl.findall('./pubDate')[0].text
3183 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3184 urllib.urlencode({'uri': mediaId}))
3185 configReq = urllib2.Request(configUrl)
3186 self.report_config_download(epTitle)
3188 configXml = urllib2.urlopen(configReq).read()
3189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3190 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3193 cdoc = xml.etree.ElementTree.fromstring(configXml)
3195 for rendition in cdoc.findall('.//rendition'):
3196 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3200 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3203 # For now, just pick the highest bitrate
# NOTE(review): 'turls' is built in a sampled-out line (presumably
# appending 'finfo'); taking turls[-1] assumes bitrate-sorted order —
# verify in the full source.
3204 format,video_url = turls[-1]
3206 self._downloader.increment_downloads()
3208 effTitle = showId + u'-' + epTitle
3213 'upload_date': officialDate,
3215 'stitle': _simplify_title(effTitle),
3219 'description': officialTitle,
3220 'player_url': playerUrl
3224 self._downloader.process_info(info)
3225 except UnavailableVideoError, err:
3226 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3230 class EscapistIE(InfoExtractor):
3231 """Information extractor for The Escapist """
3233 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3234 IE_NAME = u'escapist'
3236 def report_extraction(self, showName):
3237 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3239 def report_config_download(self, showName):
3240 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3242 def _real_extract(self, url):
3243 htmlParser = HTMLParser.HTMLParser()
3245 mobj = re.match(self._VALID_URL, url)
3247 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3249 showName = mobj.group('showname')
3250 videoId = mobj.group('episode')
3252 self.report_extraction(showName)
3254 webPage = urllib2.urlopen(url).read()
3255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3259 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3260 description = htmlParser.unescape(descMatch.group(1))
3261 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3262 imgUrl = htmlParser.unescape(imgMatch.group(1))
3263 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3264 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3265 configUrlMatch = re.search('config=(.*)$', playerUrl)
3266 configUrl = urllib2.unquote(configUrlMatch.group(1))
3268 self.report_config_download(showName)
3270 configJSON = urllib2.urlopen(configUrl).read()
3271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3272 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3275 # Technically, it's JavaScript, not JSON
3276 configJSON = configJSON.replace("'", '"')
3279 config = json.loads(configJSON)
3280 except (ValueError,), err:
3281 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3284 playlist = config['playlist']
3285 videoUrl = playlist[1]['url']
3287 self._downloader.increment_downloads()
3291 'uploader': showName,
3292 'upload_date': None,
3294 'stitle': _simplify_title(showName),
3297 'thumbnail': imgUrl,
3298 'description': description,
3299 'player_url': playerUrl,
3303 self._downloader.process_info(info)
3304 except UnavailableVideoError, err:
3305 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# CollegeHumorIE: resolves the public video id to an internal id scraped
# from the page, then reads title/description/file URL from the moogaloop
# metadata XML.
# NOTE(review): sampled listing — original line numbers skip; missing
# guard/try/return lines fall in the gaps.
3308 class CollegeHumorIE(InfoExtractor):
3309 """Information extractor for collegehumor.com"""
3311 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3312 IE_NAME = u'collegehumor'
3314 def report_webpage(self, video_id):
3315 """Report information extraction."""
3316 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3318 def report_extraction(self, video_id):
3319 """Report information extraction."""
3320 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3322 def _real_extract(self, url):
3323 htmlParser = HTMLParser.HTMLParser()
3325 mobj = re.match(self._VALID_URL, url)
3327 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3329 video_id = mobj.group('videoid')
3331 self.report_webpage(video_id)
3332 request = urllib2.Request(url)
3334 webpage = urllib2.urlopen(request).read()
3335 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3336 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds the internal id as id="video:NNN".
3339 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3341 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3343 internal_video_id = m.group('internalvideoid')
3347 'internal_id': internal_video_id,
3350 self.report_extraction(video_id)
3351 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3353 metaXml = urllib2.urlopen(xmlUrl).read()
3354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3355 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3358 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError if an element is absent; the
# corresponding except (in a sampled-out line) reports invalid XML below.
3360 videoNode = mdoc.findall('./video')[0]
3361 info['description'] = videoNode.findall('./description')[0].text
3362 info['title'] = videoNode.findall('./caption')[0].text
3363 info['stitle'] = _simplify_title(info['title'])
3364 info['url'] = videoNode.findall('./file')[0].text
3365 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3366 info['ext'] = info['url'].rpartition('.')[2]
3367 info['format'] = info['ext']
3369 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3372 self._downloader.increment_downloads()
3375 self._downloader.process_info(info)
3376 except UnavailableVideoError, err:
3377 self._downloader.trouble(u'\nERROR: unable to download video')
# XVideosIE: scrapes the flv_url query parameter, the <title>, and the
# thumbnail URL directly out of the watch page.
# NOTE(review): sampled listing — original line numbers skip; missing
# guard/try/return lines fall in the gaps.
3380 class XVideosIE(InfoExtractor):
3381 """Information extractor for xvideos.com"""
3383 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3384 IE_NAME = u'xvideos'
3386 def report_webpage(self, video_id):
3387 """Report information extraction."""
3388 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3390 def report_extraction(self, video_id):
3391 """Report information extraction."""
3392 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3394 def _real_extract(self, url):
3395 htmlParser = HTMLParser.HTMLParser()
3397 mobj = re.match(self._VALID_URL, url)
3399 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3401 video_id = mobj.group(1).decode('utf-8')
3403 self.report_webpage(video_id)
3405 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3407 webpage = urllib2.urlopen(request).read()
3408 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3409 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3412 self.report_extraction(video_id)
# Video URL is percent-encoded inside the page as flv_url=...&
3416 mobj = re.search(r'flv_url=(.+?)&', webpage)
3418 self._downloader.trouble(u'ERROR: unable to extract video url')
3420 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3424 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3426 self._downloader.trouble(u'ERROR: unable to extract video title')
3428 video_title = mobj.group(1).decode('utf-8')
3431 # Extract video thumbnail
3432 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3434 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3436 video_thumbnail = mobj.group(1).decode('utf-8')
3440 self._downloader.increment_downloads()
3445 'upload_date': None,
3446 'title': video_title,
3447 'stitle': _simplify_title(video_title),
3450 'thumbnail': video_thumbnail,
3451 'description': None,
3456 self._downloader.process_info(info)
3457 except UnavailableVideoError, err:
3458 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3461 class SoundcloudIE(InfoExtractor):
3462 """Information extractor for soundcloud.com
3463 To access the media, the uid of the song and a stream token
3464 must be extracted from the page source and the script must make
3465 a request to media.soundcloud.com/crossdomain.xml. Then
3466 the media can be grabbed by requesting from an url composed
3467 of the stream token and uid
3470 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3471 IE_NAME = u'soundcloud'
3473 def __init__(self, downloader=None):
3474 InfoExtractor.__init__(self, downloader)
3476 def report_webpage(self, video_id):
3477 """Report information extraction."""
3478 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3480 def report_extraction(self, video_id):
3481 """Report information extraction."""
3482 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3484 def _real_extract(self, url):
3485 htmlParser = HTMLParser.HTMLParser()
3487 mobj = re.match(self._VALID_URL, url)
3489 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3492 # extract uploader (which is in the url)
3493 uploader = mobj.group(1).decode('utf-8')
3494 # extract simple title (uploader + slug of song title)
3495 slug_title = mobj.group(2).decode('utf-8')
3496 simple_title = uploader + '-' + slug_title
3498 self.report_webpage('%s/%s' % (uploader, slug_title))
3500 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3502 webpage = urllib2.urlopen(request).read()
3503 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3504 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3507 self.report_extraction('%s/%s' % (uploader, slug_title))
3509 # extract uid and stream token that soundcloud hands out for access
3510 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3512 video_id = mobj.group(1)
3513 stream_token = mobj.group(2)
3515 # extract unsimplified title
3516 mobj = re.search('"title":"(.*?)",', webpage)
3518 title = mobj.group(1)
3520 # construct media url (with uid/token)
3521 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3522 mediaURL = mediaURL % (video_id, stream_token)
3525 description = u'No description available'
3526 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3528 description = mobj.group(1)
3532 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3535 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3536 except Exception, e:
3539 # for soundcloud, a request to a cross domain is required for cookies
3540 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3543 self._downloader.process_info({
3544 'id': video_id.decode('utf-8'),
3546 'uploader': uploader.decode('utf-8'),
3547 'upload_date': upload_date,
3548 'title': simple_title.decode('utf-8'),
3549 'stitle': simple_title.decode('utf-8'),
3553 'description': description.decode('utf-8')
3555 except UnavailableVideoError:
3556 self._downloader.trouble(u'\nERROR: unable to download video')
# InfoQIE: decodes the base64 'jsclassref' attribute into an rtmpe media
# path and scrapes title/description from the page.
# NOTE(review): sampled listing — original line numbers skip; missing
# guard/try/return lines fall in the gaps.
3559 class InfoQIE(InfoExtractor):
3560 """Information extractor for infoq.com"""
3562 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3565 def report_webpage(self, video_id):
3566 """Report information extraction."""
3567 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3569 def report_extraction(self, video_id):
3570 """Report information extraction."""
3571 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3573 def _real_extract(self, url):
3574 htmlParser = HTMLParser.HTMLParser()
3576 mobj = re.match(self._VALID_URL, url)
3578 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3581 self.report_webpage(url)
3583 request = urllib2.Request(url)
3585 webpage = urllib2.urlopen(request).read()
3586 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3587 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3590 self.report_extraction(url)
# jsclassref holds a base64-encoded, percent-encoded path fragment.
3594 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3596 self._downloader.trouble(u'ERROR: unable to extract video url')
3598 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3602 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3604 self._downloader.trouble(u'ERROR: unable to extract video title')
3606 video_title = mobj.group(1).decode('utf-8')
3608 # Extract description
3609 video_description = u'No description available.'
3610 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3611 if mobj is not None:
3612 video_description = mobj.group(1).decode('utf-8')
# NOTE(review): split('.') raises ValueError if the filename contains
# more than one dot — rsplit('.', 1) would be safer; confirm filenames
# before changing.
3614 video_filename = video_url.split('/')[-1]
3615 video_id, extension = video_filename.split('.')
3617 self._downloader.increment_downloads()
3622 'upload_date': None,
3623 'title': video_title,
3624 'stitle': _simplify_title(video_title),
3626 'format': extension, # Extension is always(?) mp4, but seems to be flv
3628 'description': video_description,
3633 self._downloader.process_info(info)
3634 except UnavailableVideoError, err:
3635 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# MixcloudIE: queries mixcloud's JSON API for a cloudcast, then probes the
# listed audio format URLs until one responds.
# NOTE(review): sampled listing — original line numbers skip; missing
# guard/try/return lines fall in the gaps.
3637 class MixcloudIE(InfoExtractor):
3638 """Information extractor for www.mixcloud.com"""
3639 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3640 IE_NAME = u'mixcloud'
3642 def __init__(self, downloader=None):
3643 InfoExtractor.__init__(self, downloader)
3645 def report_download_json(self, file_id):
3646 """Report JSON download."""
3647 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3649 def report_extraction(self, file_id):
3650 """Report information extraction."""
3651 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3653 def get_urls(self, jsonData, fmt, bitrate='best'):
3654 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a bitrate->urls mapping or a plain url list;
# the TypeError fallback below handles the list case.
3657 bitrate_list = jsonData[fmt]
3658 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3659 bitrate = max(bitrate_list) # select highest
3661 url_list = jsonData[fmt][bitrate]
3662 except TypeError: # we have no bitrate info.
3663 url_list = jsonData[fmt]
3667 def check_urls(self, url_list):
3668 """Returns 1st active url from list"""
3669 for url in url_list:
3671 urllib2.urlopen(url)
3673 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3678 def _print_formats(self, formats):
3679 print 'Available formats:'
3680 for fmt in formats.keys():
3681 for b in formats[fmt]:
3683 ext = formats[fmt][b][0]
3684 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3685 except TypeError: # we have no bitrate info
3686 ext = formats[fmt][0]
3687 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3690 def _real_extract(self, url):
3691 mobj = re.match(self._VALID_URL, url)
3693 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3695 # extract uploader & filename from url
3696 uploader = mobj.group(1).decode('utf-8')
3697 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3699 # construct API request
3700 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3701 # retrieve .json file with links to files
3702 request = urllib2.Request(file_url)
3704 self.report_download_json(file_url)
3705 jsonData = urllib2.urlopen(request).read()
3706 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3707 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3711 json_data = json.loads(jsonData)
3712 player_url = json_data['player_swf_url']
3713 formats = dict(json_data['audio_formats'])
3715 req_format = self._downloader.params.get('format', None)
3718 if self._downloader.params.get('listformats', None):
3719 self._print_formats(formats)
# 'best': try each format until a live URL is found; otherwise use the
# requested format only.
3722 if req_format is None or req_format == 'best':
3723 for format_param in formats.keys():
3724 url_list = self.get_urls(formats, format_param)
3726 file_url = self.check_urls(url_list)
3727 if file_url is not None:
3730 if req_format not in formats.keys():
3731 self._downloader.trouble(u'ERROR: format is not available')
3734 url_list = self.get_urls(formats, req_format)
3735 file_url = self.check_urls(url_list)
3736 format_param = req_format
3739 self._downloader.increment_downloads()
3741 # Process file information
3742 self._downloader.process_info({
3743 'id': file_id.decode('utf-8'),
3744 'url': file_url.decode('utf-8'),
3745 'uploader': uploader.decode('utf-8'),
3746 'upload_date': u'NA',
3747 'title': json_data['name'],
3748 'stitle': _simplify_title(json_data['name']),
3749 'ext': file_url.split('.')[-1].decode('utf-8'),
# NOTE(review): the 'X and A or B' idiom misbehaves when A is falsy;
# here A is the constant u'NA' so it is safe, but a plain conditional
# expression would be clearer.
3750 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3751 'thumbnail': json_data['thumbnail_url'],
3752 'description': json_data['description'],
3753 'player_url': player_url.decode('utf-8'),
3755 except UnavailableVideoError, err:
3756 self._downloader.trouble(u'ERROR: unable to download file')
# StanfordOpenClassroomIE: three-way extractor — a specific video (course +
# video params), a course page (list of VideoPage references), or the site
# root (list of CoursePage references). List entries are re-dispatched via
# self.extract().
# NOTE(review): sampled listing — original line numbers skip; missing
# guard/try/return and dict-literal lines fall in the gaps.
3758 class StanfordOpenClassroomIE(InfoExtractor):
3759 """Information extractor for Stanford's Open ClassRoom"""
3761 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3762 IE_NAME = u'stanfordoc'
3764 def report_download_webpage(self, objid):
3765 """Report information extraction."""
3766 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3768 def report_extraction(self, video_id):
3769 """Report information extraction."""
3770 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3772 def _real_extract(self, url):
3773 mobj = re.match(self._VALID_URL, url)
3775 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3778 if mobj.group('course') and mobj.group('video'): # A specific video
3779 course = mobj.group('course')
3780 video = mobj.group('video')
3782 'id': _simplify_title(course + '_' + video),
3785 self.report_extraction(info['id'])
3786 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3787 xmlUrl = baseUrl + video + '.xml'
3789 metaXml = urllib2.urlopen(xmlUrl).read()
3790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3791 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3793 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3795 info['title'] = mdoc.findall('./title')[0].text
3796 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3798 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3800 info['stitle'] = _simplify_title(info['title'])
3801 info['ext'] = info['url'].rpartition('.')[2]
3802 info['format'] = info['ext']
3803 self._downloader.increment_downloads()
3805 self._downloader.process_info(info)
3806 except UnavailableVideoError, err:
3807 self._downloader.trouble(u'\nERROR: unable to download video')
3808 elif mobj.group('course'): # A course page
3809 unescapeHTML = HTMLParser.HTMLParser().unescape
3811 course = mobj.group('course')
3813 'id': _simplify_title(course),
3817 self.report_download_webpage(info['id'])
3819 coursepage = urllib2.urlopen(url).read()
3820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3821 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title falls back to the id when no <h1> is found.
3824 m = re.search('<h1>([^<]+)</h1>', coursepage)
3826 info['title'] = unescapeHTML(m.group(1))
3828 info['title'] = info['id']
3829 info['stitle'] = _simplify_title(info['title'])
3831 m = re.search('<description>([^<]+)</description>', coursepage)
3833 info['description'] = unescapeHTML(m.group(1))
3835 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3838 'type': 'reference',
3839 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3843 for entry in info['list']:
3844 assert entry['type'] == 'reference'
3845 self.extract(entry['url'])
# Root page: collect every CoursePage link and recurse into each.
3847 unescapeHTML = HTMLParser.HTMLParser().unescape
3850 'id': 'Stanford OpenClassroom',
3854 self.report_download_webpage(info['id'])
3855 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3857 rootpage = urllib2.urlopen(rootURL).read()
3858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3859 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3862 info['title'] = info['id']
3863 info['stitle'] = _simplify_title(info['title'])
3865 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3868 'type': 'reference',
3869 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3873 for entry in info['list']:
3874 assert entry['type'] == 'reference'
3875 self.extract(entry['url'])
# PostProcessor: base class for the post-download processing chain.
# Subclasses override run(); returning None stops the chain, returning an
# info dict passes it to the next PostProcessor.
3878 class PostProcessor(object):
3879 """Post Processor class.
3881 PostProcessor objects can be added to downloaders with their
3882 add_post_processor() method. When the downloader has finished a
3883 successful download, it will take its internal chain of PostProcessors
3884 and start calling the run() method on each one of them, first with
3885 an initial argument and then with the returned value of the previous
3888 The chain will be stopped if one of them ever returns None or the end
3889 of the chain is reached.
3891 PostProcessor objects follow a "mutual registration" process similar
3892 to InfoExtractor objects.
3897 def __init__(self, downloader=None):
# The owning FileDownloader; may be set later via set_downloader().
3898 self._downloader = downloader
3900 def set_downloader(self, downloader):
3901 """Sets the downloader for this PP."""
3902 self._downloader = downloader
3904 def run(self, information):
3905 """Run the PostProcessor.
3907 The "information" argument is a dictionary like the ones
3908 composed by InfoExtractors. The only difference is that this
3909 one has an extra field called "filepath" that points to the
3912 When this method returns None, the postprocessing chain is
3913 stopped. However, this method may return an information
3914 dictionary that will be passed to the next postprocessing
3915 object in the chain. It can be the one it received after
3916 changing some fields.
3918 In addition, this method may raise a PostProcessingError
3919 exception that will be taken into account by the downloader
3922 return information # by default, do nothing
# FFmpegExtractAudioPP: post-processor that extracts the audio track with
# ffmpeg/ffprobe, optionally transcoding to a preferred codec/quality, and
# (unless keepvideo) removes the original file.
# NOTE(review): sampled listing — original line numbers skip; decorators,
# 'try:' lines and returns fall in the gaps (get_audio_codec/run_ffmpeg take
# no 'self', so @staticmethod presumably sits in a sampled-out line — confirm).
3925 class FFmpegExtractAudioPP(PostProcessor):
3927 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3928 PostProcessor.__init__(self, downloader)
3929 if preferredcodec is None:
3930 preferredcodec = 'best'
3931 self._preferredcodec = preferredcodec
3932 self._preferredquality = preferredquality
3933 self._keepvideo = keepvideo
3936 def get_audio_codec(path):
# Runs ffprobe and scans its stream dump for the audio codec name.
3938 cmd = ['ffprobe', '-show_streams', '--', path]
3939 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3940 output = handle.communicate()[0]
3941 if handle.wait() != 0:
3943 except (IOError, OSError):
3946 for line in output.split('\n'):
3947 if line.startswith('codec_name='):
3948 audio_codec = line.split('=')[1].strip()
3949 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3954 def run_ffmpeg(path, out_path, codec, more_opts):
3956 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3957 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3959 except (IOError, OSError):
3962 def run(self, information):
3963 path = information['filepath']
3965 filecodec = self.get_audio_codec(path)
3966 if filecodec is None:
3967 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# 'best' (or matching codec): copy the stream losslessly when the source
# is already aac/mp3/vorbis; otherwise transcode with the mapped encoder.
3971 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3972 if filecodec in ['aac', 'mp3', 'vorbis']:
3973 # Lossless if possible
3975 extension = filecodec
3976 if filecodec == 'aac':
3977 more_opts = ['-f', 'adts']
3978 if filecodec == 'vorbis':
3982 acodec = 'libmp3lame'
3985 if self._preferredquality is not None:
3986 more_opts += ['-ab', self._preferredquality]
3988 # We convert the audio (lossy)
3989 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3990 extension = self._preferredcodec
3992 if self._preferredquality is not None:
3993 more_opts += ['-ab', self._preferredquality]
3994 if self._preferredcodec == 'aac':
3995 more_opts += ['-f', 'adts']
3996 if self._preferredcodec == 'vorbis':
3999 (prefix, ext) = os.path.splitext(path)
4000 new_path = prefix + '.' + extension
4001 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
4002 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
4005 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
4008 # Try to update the date time for extracted audio file.
4009 if information.get('filetime') is not None:
4011 os.utime(new_path, (time.time(), information['filetime']))
4013 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4015 if not self._keepvideo:
4018 except (IOError, OSError):
4019 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4022 information['filepath'] = new_path
# updateSelf: self-update — fetch UPDATE_URL, skip if __version__ matches,
# otherwise overwrite the running program file in place.
# NOTE(review): sampled listing — the 'try:' lines paired with the visible
# 'except' clauses fall in the gaps.
4026 def updateSelf(downloader, filename):
4027 ''' Update the program file with the latest version from the repository '''
4028 # Note: downloader only used for options
4029 if not os.access(filename, os.W_OK):
4030 sys.exit('ERROR: no write permissions on %s' % filename)
4032 downloader.to_screen('Updating to latest version...')
4036 urlh = urllib.urlopen(UPDATE_URL)
4037 newcontent = urlh.read()
# Parse the remote __version__ to detect an up-to-date install early.
4039 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4040 if vmatch is not None and vmatch.group(1) == __version__:
4041 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4045 except (IOError, OSError), err:
4046 sys.exit('ERROR: unable to download latest version')
4049 outf = open(filename, 'wb')
4051 outf.write(newcontent)
4054 except (IOError, OSError), err:
4055 sys.exit('ERROR: unable to overwrite current version')
4057 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# _readOptions: reads a config file and returns its contents as a flat list
# of command-line tokens (shlex-split, '#' comments honored). A missing file
# yields [] — the except clause pairing with the visible open() is in a
# sampled-out line.
4065 def _readOptions(filename):
4067 optionf = open(filename)
4069 return [] # silently skip if file is not present
4073 res += shlex.split(l, comments=True)
# _format_option_string: optparse help formatter hook — renders an option's
# short form, long form, and metavar as a single string.
4078 def _format_option_string(option):
4079 ''' ('-o', '--option') -> -o, --option METAVAR'''
4083 if option._short_opts: opts.append(option._short_opts[0])
4084 if option._long_opts: opts.append(option._long_opts[0])
4085 if len(opts) > 1: opts.insert(1, ', ')
4087 if option.takes_value(): opts.append(' %s' % option.metavar)
4089 return "".join(opts)
# _find_term_columns: best-effort terminal width — $COLUMNS first, then
# 'stty size' (rows cols; index 1 is the column count). The 'try:' and the
# fallback return fall in sampled-out lines.
4091 def _find_term_columns():
4092 columns = os.environ.get('COLUMNS', None)
4097 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4098 out,err = sp.communicate()
4099 return int(out.split()[1])
def parseOpts():
    """Build the command-line option parser, merge in options read from the
    system-wide and per-user configuration files, and parse the arguments.

    @return  (parser, opts, args) — the OptionParser (for .error()),
             the parsed option values, and the positional URL arguments.
    """
    # NOTE(review): the visible original used `max_width` without defining it
    # and never passed `fmt` to the parser; both are wired up here.
    max_width = 80  # fallback width when the terminal size cannot be detected
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns:
        max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version': __version__,
        'formatter': fmt,  # use the custom "-o, --option METAVAR" formatter
        'usage': '%prog [options] url [url...]',
        'conflict_handler': 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # Option groups, in the order they are declared below.
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title',
            dest='matchtitle', metavar='REGEX', help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title',
            dest='rejecttitle', metavar='REGEX', help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads',
            metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis" or "mp3"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Prepend options from the system-wide and per-user configuration files
    # to the real command line, so explicit arguments win on conflict.
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    if xdg_config_home:
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    else:
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
# NOTE(review): this copy of gen_extractors is fragmentary — the docstring's
# closing quotes, the opening of the returned list, and a number of extractor
# entries are not visible here. The surviving lines are kept byte-for-byte.
4267 def gen_extractors():
4268 """ Return a list of an instance of every supported extractor.
4269 The order does matter; the first extractor matched is the one handling the URL.
# One shared instance per site: the derived playlist/user/search extractors
# below are constructed around these base extractor instances.
4271 youtube_ie = YoutubeIE()
4272 google_ie = GoogleIE()
4273 yahoo_ie = YahooIE()
# The lines below are entries of the returned list; further entries between
# them appear to have been lost from this copy — TODO confirm against a
# complete copy of the file before editing.
4275 YoutubePlaylistIE(youtube_ie),
4276 YoutubeUserIE(youtube_ie),
4277 YoutubeSearchIE(youtube_ie),
4279 MetacafeIE(youtube_ie),
4282 GoogleSearchIE(google_ie),
4285 YahooSearchIE(yahoo_ie),
4298 StanfordOpenClassroomIE(),
# NOTE(review): this appears to be the body of the program's command-line
# entry point — its `def` line is not visible in this copy, and a number of
# lines (mostly `try:`/`else:` scaffolding and blank lines) are missing.
# The surviving lines are kept byte-for-byte; only comments are added.
4304 parser, opts, args = parseOpts()
4306 # Open appropriate CookieJar
4307 if opts.cookiefile is None:
4308 jar = cookielib.CookieJar()
# A cookie file was given: use a persistent MozillaCookieJar and load it
# when it exists and is readable (the `try:` wrapping this is not visible).
4311 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4312 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4314 except (IOError, OSError), err:
4315 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the User-Agent header used for HTTP requests.
4318 if opts.dump_user_agent:
4319 print std_headers['User-Agent']
4322 # Batch file verification
4324 if opts.batchfile is not None:
# '-' means read the URL list from stdin (the branch reading sys.stdin is
# not visible here — TODO confirm).
4326 if opts.batchfile == '-':
4329 batchfd = open(opts.batchfile, 'r')
4330 batchurls = batchfd.readlines()
4331 batchurls = [x.strip() for x in batchurls]
# Drop empty lines and lines beginning with '#', '/' or ';' (comments).
4332 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4334 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then the positional command-line URLs.
4335 all_urls = batchurls + args
4337 # General configuration
# Install a global urllib2 opener carrying the cookie jar, proxy handling
# and the project's YoutubeDLHandler.
4338 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4339 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4340 urllib2.install_opener(opener)
4341 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4343 extractors = gen_extractors()
# --list-extractors: show each extractor and which of the given URLs it
# would handle; each URL is consumed by the first matching extractor.
4345 if opts.list_extractors:
4346 for ie in extractors:
4348 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4349 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4350 for mu in matchedUrls:
4354 # Conflicting, missing and erroneous options
4355 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4356 parser.error(u'using .netrc conflicts with giving username/password')
4357 if opts.password is not None and opts.username is None:
4358 parser.error(u'account username missing')
4359 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4360 parser.error(u'using output template conflicts with using title, literal title or auto number')
4361 if opts.usetitle and opts.useliteral:
4362 parser.error(u'using title conflicts with using literal title')
# Username given without password: prompt interactively instead of failing.
4363 if opts.username is not None and opts.password is None:
4364 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize the rate limit ('50k', '44.6m', ...) to a byte count.
4365 if opts.ratelimit is not None:
4366 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4367 if numeric_limit is None:
4368 parser.error(u'invalid rate limit specified')
4369 opts.ratelimit = numeric_limit
4370 if opts.retries is not None:
4372 opts.retries = long(opts.retries)
4373 except (TypeError, ValueError), err:
4374 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be >= 1; end is -1 ("until the end") or a
# value no smaller than start.
4376 opts.playliststart = int(opts.playliststart)
4377 if opts.playliststart <= 0:
4378 raise ValueError(u'Playlist start must be positive')
4379 except (TypeError, ValueError), err:
4380 parser.error(u'invalid playlist start number specified')
4382 opts.playlistend = int(opts.playlistend)
4383 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4384 raise ValueError(u'Playlist end must be greater than playlist start')
4385 except (TypeError, ValueError), err:
4386 parser.error(u'invalid playlist end number specified')
4387 if opts.extractaudio:
4388 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4389 parser.error(u'invalid audio format specified')
# Build the FileDownloader from the validated options. Any of the
# "get-*" flags implies quiet, simulate-only operation.
4392 fd = FileDownloader({
4393 'usenetrc': opts.usenetrc,
4394 'username': opts.username,
4395 'password': opts.password,
4396 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4397 'forceurl': opts.geturl,
4398 'forcetitle': opts.gettitle,
4399 'forcethumbnail': opts.getthumbnail,
4400 'forcedescription': opts.getdescription,
4401 'forcefilename': opts.getfilename,
4402 'forceformat': opts.getformat,
4403 'simulate': opts.simulate,
4404 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4405 'format': opts.format,
4406 'format_limit': opts.format_limit,
4407 'listformats': opts.listformats,
# The chained or-expression picks the first applicable output template:
# an explicit -o template wins, then format/title/autonumber combinations,
# finally the bare '%(id)s.%(ext)s' default.
4408 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4409 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4410 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4411 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4412 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4413 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4414 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4415 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4416 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4417 or u'%(id)s.%(ext)s'),
4418 'ignoreerrors': opts.ignoreerrors,
4419 'ratelimit': opts.ratelimit,
4420 'nooverwrites': opts.nooverwrites,
4421 'retries': opts.retries,
4422 'continuedl': opts.continue_dl,
4423 'noprogress': opts.noprogress,
4424 'playliststart': opts.playliststart,
4425 'playlistend': opts.playlistend,
# Writing to stdout ('-o -') forces logging to stderr so output stays clean.
4426 'logtostderr': opts.outtmpl == '-',
4427 'consoletitle': opts.consoletitle,
4428 'nopart': opts.nopart,
4429 'updatetime': opts.updatetime,
4430 'writedescription': opts.writedescription,
4431 'writeinfojson': opts.writeinfojson,
4432 'matchtitle': opts.matchtitle,
4433 'rejecttitle': opts.rejecttitle,
4434 'max_downloads': opts.max_downloads,
# (the closing of this dict/call is not visible in this copy)
# Register every extractor with the downloader, in priority order.
4436 for extractor in extractors:
4437 fd.add_info_extractor(extractor)
# Post-processing: attach the ffmpeg audio-extraction step when requested.
4440 if opts.extractaudio:
4441 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# -U/--update: replace the running script with the latest released version.
4444 if opts.update_self:
4445 updateSelf(fd, sys.argv[0])
4448 if len(all_urls) < 1:
4449 if not opts.update_self:
4450 parser.error(u'you must provide at least one URL')
# Run the downloads; MaxDownloadsReached aborts cleanly once the
# --max-downloads limit is hit (the `try:` line is not visible here).
4455 retcode = fd.download(all_urls)
4456 except MaxDownloadsReached:
4457 fd.to_screen(u'--max-download limit reached, aborting.')
4460 # Dump cookie jar if requested
4461 if opts.cookiefile is not None:
4464 except (IOError, OSError), err:
4465 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling for the whole run (the enclosing `try:` is not
# visible in this copy).
4472 except DownloadError:
4474 except SameFileError:
4475 sys.exit(u'ERROR: fixed output name but more than one file to download')
4476 except KeyboardInterrupt:
4477 sys.exit(u'\nERROR: Interrupted by user')
# Script entry guard; the guarded call itself is not visible in this copy.
4479 if __name__ == '__main__':
4482 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: