2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2011.12.08'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
219 entity = matchobj.group(1)
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
231 numstr = u'0%s' % numstr
234 return unichr(long(numstr, base))
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
293 class DownloadError(Exception):
294 """Download Error exception.
296 This exception may be thrown by FileDownloader objects if they are not
297 configured to continue on errors. They will contain the appropriate
303 class SameFileError(Exception):
304 """Same File exception.
306 This exception will be thrown by FileDownloader objects if they detect
307 multiple files would have to be downloaded to the same file on disk.
312 class PostProcessingError(Exception):
313 """Post Processing exception.
315 This exception may be raised by PostProcessor's .run() method to
316 indicate an error in the postprocessing task.
320 class MaxDownloadsReached(Exception):
321 """ --max-downloads limit has been reached. """
325 class UnavailableVideoError(Exception):
326 """Unavailable Format exception.
328 This exception will be thrown when a video is requested
329 in a format that is not available for that video.
334 class ContentTooShortError(Exception):
335 """Content Too Short exception.
337 This exception may be raised by FileDownloader objects when a file they
338 download is too small for what the server announced first, indicating
339 the connection was probably interrupted.
345 def __init__(self, downloaded, expected):
346 self.downloaded = downloaded
347 self.expected = expected
350 class YoutubeDLHandler(urllib2.HTTPHandler):
351 """Handler for HTTP requests and responses.
353 This class, when installed with an OpenerDirector, automatically adds
354 the standard headers to every HTTP request and handles gzipped and
355 deflated responses from web servers. If compression is to be avoided in
356 a particular request, the original request in the program code only has
357 to include the HTTP header "Youtubedl-No-Compression", which will be
358 removed before making the real request.
360 Part of this code was copied from:
362 http://techknack.net/python-urllib2-handlers/
364 Andrew Rowls, the author of that code, agreed to release it to the
371 return zlib.decompress(data, -zlib.MAX_WBITS)
373 return zlib.decompress(data)
376 def addinfourl_wrapper(stream, headers, url, code):
377 if hasattr(urllib2.addinfourl, 'getcode'):
378 return urllib2.addinfourl(stream, headers, url, code)
379 ret = urllib2.addinfourl(stream, headers, url)
383 def http_request(self, req):
384 for h in std_headers:
387 req.add_header(h, std_headers[h])
388 if 'Youtubedl-no-compression' in req.headers:
389 if 'Accept-encoding' in req.headers:
390 del req.headers['Accept-encoding']
391 del req.headers['Youtubedl-no-compression']
394 def http_response(self, req, resp):
397 if resp.headers.get('Content-encoding', '') == 'gzip':
398 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
399 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
400 resp.msg = old_resp.msg
402 if resp.headers.get('Content-encoding', '') == 'deflate':
403 gz = StringIO.StringIO(self.deflate(resp.read()))
404 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
405 resp.msg = old_resp.msg
409 class FileDownloader(object):
410 """File Downloader class.
412 File downloader objects are the ones responsible of downloading the
413 actual video file and writing it to disk if the user has requested
414 it, among some other tasks. In most cases there should be one per
415 program. As, given a video URL, the downloader doesn't know how to
416 extract all the needed information, task that InfoExtractors do, it
417 has to pass the URL to one of them.
419 For this, file downloader objects have a method that allows
420 InfoExtractors to be registered in a given order. When it is passed
421 a URL, the file downloader handles it to the first InfoExtractor it
422 finds that reports being able to handle it. The InfoExtractor extracts
423 all the information about the video or videos the URL refers to, and
424 asks the FileDownloader to process the video information, possibly
425 downloading the video.
427 File downloaders accept a lot of parameters. In order not to saturate
428 the object constructor with arguments, it receives a dictionary of
429 options instead. These options are available through the params
430 attribute for the InfoExtractors to use. The FileDownloader also
431 registers itself as the downloader in charge for the InfoExtractors
432 that are added to it, so this is a "mutual registration".
436 username: Username for authentication purposes.
437 password: Password for authentication purposes.
438 usenetrc: Use netrc for authentication instead.
439 quiet: Do not print messages to stdout.
440 forceurl: Force printing final URL.
441 forcetitle: Force printing title.
442 forcethumbnail: Force printing thumbnail URL.
443 forcedescription: Force printing description.
444 forcefilename: Force printing final filename.
445 simulate: Do not download the video files.
446 format: Video format code.
447 format_limit: Highest quality format to try.
448 outtmpl: Template for output names.
449 ignoreerrors: Do not stop on download errors.
450 ratelimit: Download speed limit, in bytes/sec.
451 nooverwrites: Prevent overwriting files.
452 retries: Number of times to retry for HTTP error 5xx
453 continuedl: Try to continue downloads if possible.
454 noprogress: Do not print the progress bar.
455 playliststart: Playlist item to start at.
456 playlistend: Playlist item to end at.
457 matchtitle: Download only matching titles.
458 rejecttitle: Reject downloads for matching titles.
459 logtostderr: Log messages to stderr instead of stdout.
460 consoletitle: Display progress in console window's titlebar.
461 nopart: Do not use temporary .part files.
462 updatetime: Use the Last-modified header to set output file timestamps.
463 writedescription: Write the video description to a .description file
464 writeinfojson: Write the video description to a .info.json file
470 _download_retcode = None
471 _num_downloads = None
474 def __init__(self, params):
475 """Create a FileDownloader object with the given options."""
478 self._download_retcode = 0
479 self._num_downloads = 0
480 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
484 def format_bytes(bytes):
487 if type(bytes) is str:
492 exponent = long(math.log(bytes, 1024.0))
493 suffix = 'bkMGTPEZY'[exponent]
494 converted = float(bytes) / float(1024 ** exponent)
495 return '%.2f%s' % (converted, suffix)
498 def calc_percent(byte_counter, data_len):
501 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
504 def calc_eta(start, now, total, current):
508 if current == 0 or dif < 0.001: # One millisecond
510 rate = float(current) / dif
511 eta = long((float(total) - float(current)) / rate)
512 (eta_mins, eta_secs) = divmod(eta, 60)
515 return '%02d:%02d' % (eta_mins, eta_secs)
518 def calc_speed(start, now, bytes):
520 if bytes == 0 or dif < 0.001: # One millisecond
521 return '%10s' % '---b/s'
522 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
525 def best_block_size(elapsed_time, bytes):
526 new_min = max(bytes / 2.0, 1.0)
527 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
528 if elapsed_time < 0.001:
530 rate = bytes / elapsed_time
538 def parse_bytes(bytestr):
539 """Parse a string indicating a byte quantity into a long integer."""
540 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
543 number = float(matchobj.group(1))
544 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
545 return long(round(number * multiplier))
547 def add_info_extractor(self, ie):
548 """Add an InfoExtractor object to the end of the list."""
550 ie.set_downloader(self)
552 def add_post_processor(self, pp):
553 """Add a PostProcessor object to the end of the chain."""
555 pp.set_downloader(self)
557 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
558 """Print message to stdout if not in quiet mode."""
560 if not self.params.get('quiet', False):
561 terminator = [u'\n', u''][skip_eol]
562 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
563 self._screen_file.flush()
564 except (UnicodeEncodeError), err:
565 if not ignore_encoding_errors:
568 def to_stderr(self, message):
569 """Print message to stderr."""
570 print >>sys.stderr, message.encode(preferredencoding())
572 def to_cons_title(self, message):
573 """Set console/terminal window title to message."""
574 if not self.params.get('consoletitle', False):
576 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
577 # c_wchar_p() might not be necessary if `message` is
578 # already of type unicode()
579 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
580 elif 'TERM' in os.environ:
581 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
583 def fixed_template(self):
584 """Checks if the output template is fixed."""
585 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
587 def trouble(self, message=None):
588 """Determine action to take when a download problem appears.
590 Depending on if the downloader has been configured to ignore
591 download errors or not, this method may throw an exception or
592 not when errors are found, after printing the message.
594 if message is not None:
595 self.to_stderr(message)
596 if not self.params.get('ignoreerrors', False):
597 raise DownloadError(message)
598 self._download_retcode = 1
600 def slow_down(self, start_time, byte_counter):
601 """Sleep if the download speed is over the rate limit."""
602 rate_limit = self.params.get('ratelimit', None)
603 if rate_limit is None or byte_counter == 0:
606 elapsed = now - start_time
609 speed = float(byte_counter) / elapsed
610 if speed > rate_limit:
611 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
613 def temp_name(self, filename):
614 """Returns a temporary filename for the given filename."""
615 if self.params.get('nopart', False) or filename == u'-' or \
616 (os.path.exists(filename) and not os.path.isfile(filename)):
618 return filename + u'.part'
620 def undo_temp_name(self, filename):
621 if filename.endswith(u'.part'):
622 return filename[:-len(u'.part')]
625 def try_rename(self, old_filename, new_filename):
627 if old_filename == new_filename:
629 os.rename(old_filename, new_filename)
630 except (IOError, OSError), err:
631 self.trouble(u'ERROR: unable to rename file')
633 def try_utime(self, filename, last_modified_hdr):
634 """Try to set the last-modified time of the given file."""
635 if last_modified_hdr is None:
637 if not os.path.isfile(filename):
639 timestr = last_modified_hdr
642 filetime = timeconvert(timestr)
646 os.utime(filename, (time.time(), filetime))
651 def report_writedescription(self, descfn):
652 """ Report that the description file is being written """
653 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
655 def report_writeinfojson(self, infofn):
656 """ Report that the metadata file has been written """
657 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
659 def report_destination(self, filename):
660 """Report destination filename."""
661 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
663 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
664 """Report download progress."""
665 if self.params.get('noprogress', False):
667 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
668 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
669 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
670 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
672 def report_resuming_byte(self, resume_len):
673 """Report attempt to resume at given byte."""
674 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
676 def report_retry(self, count, retries):
677 """Report retry in case of HTTP error 5xx"""
678 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
680 def report_file_already_downloaded(self, file_name):
681 """Report file has already been fully downloaded."""
683 self.to_screen(u'[download] %s has already been downloaded' % file_name)
684 except (UnicodeEncodeError), err:
685 self.to_screen(u'[download] The file has already been downloaded')
687 def report_unable_to_resume(self):
688 """Report it was impossible to resume download."""
689 self.to_screen(u'[download] Unable to resume')
691 def report_finish(self):
692 """Report download finished."""
693 if self.params.get('noprogress', False):
694 self.to_screen(u'[download] Download completed')
698 def increment_downloads(self):
699 """Increment the ordinal that assigns a number to each file."""
700 self._num_downloads += 1
702 def prepare_filename(self, info_dict):
703 """Generate the output filename."""
705 template_dict = dict(info_dict)
706 template_dict['epoch'] = unicode(long(time.time()))
707 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
708 filename = self.params['outtmpl'] % template_dict
710 except (ValueError, KeyError), err:
711 self.trouble(u'ERROR: invalid system charset or erroneous output template')
714 def _match_entry(self, info_dict):
715 """ Returns None iff the file should be downloaded """
717 title = info_dict['title']
718 matchtitle = self.params.get('matchtitle', False)
719 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
720 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
721 rejecttitle = self.params.get('rejecttitle', False)
722 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
723 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
726 def process_info(self, info_dict):
727 """Process a single dictionary returned by an InfoExtractor."""
729 reason = self._match_entry(info_dict)
730 if reason is not None:
731 self.to_screen(u'[download] ' + reason)
734 max_downloads = self.params.get('max_downloads')
735 if max_downloads is not None:
736 if self._num_downloads > int(max_downloads):
737 raise MaxDownloadsReached()
739 filename = self.prepare_filename(info_dict)
742 if self.params.get('forcetitle', False):
743 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
744 if self.params.get('forceurl', False):
745 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
746 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
747 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
748 if self.params.get('forcedescription', False) and 'description' in info_dict:
749 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
750 if self.params.get('forcefilename', False) and filename is not None:
751 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
752 if self.params.get('forceformat', False):
753 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
755 # Do nothing else if in simulate mode
756 if self.params.get('simulate', False):
762 if self.params.get('nooverwrites', False) and os.path.exists(filename):
763 self.to_stderr(u'WARNING: file exists and will be skipped')
767 dn = os.path.dirname(filename)
768 if dn != '' and not os.path.exists(dn):
770 except (OSError, IOError), err:
771 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
774 if self.params.get('writedescription', False):
776 descfn = filename + '.description'
777 self.report_writedescription(descfn)
778 descfile = open(descfn, 'wb')
780 descfile.write(info_dict['description'].encode('utf-8'))
783 except (OSError, IOError):
784 self.trouble(u'ERROR: Cannot write description file ' + descfn)
787 if self.params.get('writeinfojson', False):
788 infofn = filename + '.info.json'
789 self.report_writeinfojson(infofn)
792 except (NameError,AttributeError):
793 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
796 infof = open(infofn, 'wb')
798 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
799 json.dump(json_info_dict, infof)
802 except (OSError, IOError):
803 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
806 if not self.params.get('skip_download', False):
808 success = self._do_download(filename, info_dict)
809 except (OSError, IOError), err:
810 raise UnavailableVideoError
811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
812 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
814 except (ContentTooShortError, ), err:
815 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
820 self.post_process(filename, info_dict)
821 except (PostProcessingError), err:
822 self.trouble(u'ERROR: postprocessing: %s' % str(err))
825 def download(self, url_list):
826 """Download a given list of URLs."""
827 if len(url_list) > 1 and self.fixed_template():
828 raise SameFileError(self.params['outtmpl'])
831 suitable_found = False
833 # Go to next InfoExtractor if not suitable
834 if not ie.suitable(url):
837 # Suitable InfoExtractor found
838 suitable_found = True
840 # Extract information from URL and process it
843 # Suitable InfoExtractor had been found; go to next URL
846 if not suitable_found:
847 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
849 return self._download_retcode
851 def post_process(self, filename, ie_info):
852 """Run the postprocessing chain on the given file."""
854 info['filepath'] = filename
860 def _download_with_rtmpdump(self, filename, url, player_url):
861 self.report_destination(filename)
862 tmpfilename = self.temp_name(filename)
864 # Check for rtmpdump first
866 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
867 except (OSError, IOError):
868 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
871 # Download using rtmpdump. rtmpdump returns exit code 2 when
872 # the connection was interrumpted and resuming appears to be
873 # possible. This is part of rtmpdump's normal usage, AFAIK.
874 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
875 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
876 while retval == 2 or retval == 1:
877 prevsize = os.path.getsize(tmpfilename)
878 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
879 time.sleep(5.0) # This seems to be needed
880 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
881 cursize = os.path.getsize(tmpfilename)
882 if prevsize == cursize and retval == 1:
884 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
885 if prevsize == cursize and retval == 2 and cursize > 1024:
886 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
890 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
891 self.try_rename(tmpfilename, filename)
894 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
897 def _do_download(self, filename, info_dict):
898 url = info_dict['url']
899 player_url = info_dict.get('player_url', None)
901 # Check file already present
902 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
903 self.report_file_already_downloaded(filename)
906 # Attempt to download using rtmpdump
907 if url.startswith('rtmp'):
908 return self._download_with_rtmpdump(filename, url, player_url)
910 tmpfilename = self.temp_name(filename)
913 # Do not include the Accept-Encoding header
914 headers = {'Youtubedl-no-compression': 'True'}
915 basic_request = urllib2.Request(url, None, headers)
916 request = urllib2.Request(url, None, headers)
918 # Establish possible resume length
919 if os.path.isfile(tmpfilename):
920 resume_len = os.path.getsize(tmpfilename)
926 if self.params.get('continuedl', False):
927 self.report_resuming_byte(resume_len)
928 request.add_header('Range','bytes=%d-' % resume_len)
934 retries = self.params.get('retries', 0)
935 while count <= retries:
936 # Establish connection
938 if count == 0 and 'urlhandle' in info_dict:
939 data = info_dict['urlhandle']
940 data = urllib2.urlopen(request)
942 except (urllib2.HTTPError, ), err:
943 if (err.code < 500 or err.code >= 600) and err.code != 416:
944 # Unexpected HTTP error
946 elif err.code == 416:
947 # Unable to resume (requested range not satisfiable)
949 # Open the connection again without the range header
950 data = urllib2.urlopen(basic_request)
951 content_length = data.info()['Content-Length']
952 except (urllib2.HTTPError, ), err:
953 if err.code < 500 or err.code >= 600:
956 # Examine the reported length
957 if (content_length is not None and
958 (resume_len - 100 < long(content_length) < resume_len + 100)):
959 # The file had already been fully downloaded.
960 # Explanation to the above condition: in issue #175 it was revealed that
961 # YouTube sometimes adds or removes a few bytes from the end of the file,
962 # changing the file size slightly and causing problems for some users. So
963 # I decided to implement a suggested change and consider the file
964 # completely downloaded if the file size differs less than 100 bytes from
965 # the one in the hard drive.
966 self.report_file_already_downloaded(filename)
967 self.try_rename(tmpfilename, filename)
970 # The length does not match, we start the download over
971 self.report_unable_to_resume()
977 self.report_retry(count, retries)
980 self.trouble(u'ERROR: giving up after %s retries' % retries)
983 data_len = data.info().get('Content-length', None)
984 if data_len is not None:
985 data_len = long(data_len) + resume_len
986 data_len_str = self.format_bytes(data_len)
987 byte_counter = 0 + resume_len
993 data_block = data.read(block_size)
995 if len(data_block) == 0:
997 byte_counter += len(data_block)
999 # Open file just in time
1002 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1003 assert stream is not None
1004 filename = self.undo_temp_name(tmpfilename)
1005 self.report_destination(filename)
1006 except (OSError, IOError), err:
1007 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1010 stream.write(data_block)
1011 except (IOError, OSError), err:
1012 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1014 block_size = self.best_block_size(after - before, len(data_block))
1017 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1018 if data_len is None:
1019 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1021 percent_str = self.calc_percent(byte_counter, data_len)
1022 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1023 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1026 self.slow_down(start, byte_counter - resume_len)
1029 self.trouble(u'\nERROR: Did not get any data blocks')
1032 self.report_finish()
1033 if data_len is not None and byte_counter != data_len:
1034 raise ContentTooShortError(byte_counter, long(data_len))
1035 self.try_rename(tmpfilename, filename)
1037 # Update file modification time
1038 if self.params.get('updatetime', True):
1039 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1044 class InfoExtractor(object):
1045 """Information Extractor class.
1047 Information extractors are the classes that, given a URL, extract
1048 information from the video (or videos) the URL refers to. This
1049 information includes the real video URL, the video title and simplified
1050 title, author and others. The information is stored in a dictionary
1051 which is then passed to the FileDownloader. The FileDownloader
1052 processes this information possibly downloading the video to the file
1053 system, among other possible outcomes. The dictionaries must include
1054 the following fields:
1056 id: Video identifier.
1057 url: Final video URL.
1058 uploader: Nickname of the video uploader.
1059 title: Literal title.
1060 stitle: Simplified title.
1061 ext: Video filename extension.
1062 format: Video format.
1063 player_url: SWF Player URL (may be None).
1065 The following fields are optional. Their primary purpose is to allow
1066 youtube-dl to serve as the backend for a video search function, such
1067 as the one in youtube2mp3. They are only used when their respective
1068 forced printing functions are called:
1070 thumbnail: Full URL to a video thumbnail image.
1071 description: One-line video description.
1073 Subclasses of this one should re-define the _real_initialize() and
1074 _real_extract() methods and define a _VALID_URL regexp.
1075 Probably, they should also be added to the list of extractors.
1081 def __init__(self, downloader=None):
1082 """Constructor. Receives an optional downloader."""
1084 self.set_downloader(downloader)
1086 def suitable(self, url):
1087 """Receives a URL and returns True if suitable for this IE."""
1088 return re.match(self._VALID_URL, url) is not None
1090 def initialize(self):
1091 """Initializes an instance (authentication, etc)."""
1093 self._real_initialize()
1096 def extract(self, url):
1097 """Extracts URL information and returns it in list of dicts."""
1099 return self._real_extract(url)
1101 def set_downloader(self, downloader):
1102 """Sets the downloader for this IE."""
1103 self._downloader = downloader
1105 def _real_initialize(self):
1106 """Real initialization process. Redefine in subclasses."""
1109 def _real_extract(self, url):
1110 """Real extraction process. Redefine in subclasses."""
# YouTube extractor: handles language/cookie setup, optional login via
# params or .netrc, age confirmation, then per-video extraction.
# NOTE(review): this chunk is an elided extract — the embedded numbers are
# original file line numbers, and many lines (if/try bodies, returns) are
# missing between them.
1114 class YoutubeIE(InfoExtractor):
1115 """Information extractor for youtube.com."""
1117 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1118 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1119 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1120 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1121 _NETRC_MACHINE = 'youtube'
1122 # Listed in order of quality (itag strings, best first)
1123 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1124 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1125 _video_extensions = {
1131 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1136 _video_dimensions = {
1151 IE_NAME = u'youtube'
1153 def report_lang(self):
1154 """Report attempt to set language."""
1155 self._downloader.to_screen(u'[youtube] Setting language')
1157 def report_login(self):
1158 """Report attempt to log in."""
1159 self._downloader.to_screen(u'[youtube] Logging in')
1161 def report_age_confirmation(self):
1162 """Report attempt to confirm age."""
1163 self._downloader.to_screen(u'[youtube] Confirming age')
1165 def report_video_webpage_download(self, video_id):
1166 """Report attempt to download video webpage."""
1167 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1169 def report_video_info_webpage_download(self, video_id):
1170 """Report attempt to download video info webpage."""
1171 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1173 def report_information_extraction(self, video_id):
1174 """Report attempt to extract video information."""
1175 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1177 def report_unavailable_format(self, video_id, format):
1178 """Report extracted video URL."""
1179 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1181 def report_rtmp_download(self):
1182 """Indicate the download will use the RTMP protocol."""
1183 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Prints an itag/extension/dimensions table for --list-formats (Python 2
# print statements).
1185 def _print_formats(self, formats):
1186 print 'Available formats:'
1188 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Initialization: pick credentials (explicit params beat .netrc), force the
# English/US site, then log in and confirm age. Failures here are warnings,
# not fatal errors, except age confirmation.
1190 def _real_initialize(self):
1191 if self._downloader is None:
1196 downloader_params = self._downloader.params
1198 # Attempt to use provided username and password or .netrc data
1199 if downloader_params.get('username', None) is not None:
1200 username = downloader_params['username']
1201 password = downloader_params['password']
1202 elif downloader_params.get('usenetrc', False):
1204 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1205 if info is not None:
1209 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1210 except (IOError, netrc.NetrcParseError), err:
1211 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: a plain GET of _LANG_URL; only the cookies matter, the body
# is discarded.
1215 request = urllib2.Request(self._LANG_URL)
1218 urllib2.urlopen(request).read()
1219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1223 # No authentication to be performed
1224 if username is None:
1229 'current_form': 'loginForm',
1231 'action_login': 'Log In',
1232 'username': username,
1233 'password': password,
1235 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# A login form still present in the response means the credentials were
# rejected.
1238 login_results = urllib2.urlopen(request).read()
1239 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1240 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation POST; unlike the steps above this reports via trouble().
1249 'action_confirm': 'Confirm',
1251 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1253 self.report_age_confirmation()
1254 age_results = urllib2.urlopen(request).read()
1255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Per-video extraction: watch page -> get_video_info -> format selection ->
# process_info for each chosen format.
1259 def _real_extract(self, url):
1260 # Extract video id from URL
1261 mobj = re.match(self._VALID_URL, url)
1263 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1265 video_id = mobj.group(2)
1268 self.report_video_webpage_download(video_id)
1269 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1271 video_webpage = urllib2.urlopen(request).read()
1272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1276 # Attempt to extract SWF player URL
1277 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1278 if mobj is not None:
1279 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a token.
1284 self.report_video_info_webpage_download(video_id)
1285 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1286 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1287 % (video_id, el_type))
1288 request = urllib2.Request(video_info_url)
1290 video_info_webpage = urllib2.urlopen(request).read()
1291 video_info = parse_qs(video_info_webpage)
1292 if 'token' in video_info:
1294 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1295 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1297 if 'token' not in video_info:
1298 if 'reason' in video_info:
1299 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1301 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1304 # Start extracting information
1305 self.report_information_extraction(video_id)
1308 if 'author' not in video_info:
1309 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1311 video_uploader = urllib.unquote_plus(video_info['author'][0])
1314 if 'title' not in video_info:
1315 self._downloader.trouble(u'ERROR: unable to extract video title')
1317 video_title = urllib.unquote_plus(video_info['title'][0])
1318 video_title = video_title.decode('utf-8')
1319 video_title = sanitize_title(video_title)
1322 simple_title = _simplify_title(video_title)
1325 if 'thumbnail_url' not in video_info:
1326 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1327 video_thumbnail = ''
1328 else: # don't panic if we can't find it
1329 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalized to YYYYMMDD by
# trying several strptime patterns.
1333 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1334 if mobj is not None:
1335 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1336 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1337 for expression in format_expressions:
1339 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description is only extracted when the user asked for it; falls back from
# a <meta> tag to an lxml xpath query on the page.
1347 video_description = u'No description available.'
1348 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1349 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1350 if mobj is not None:
1351 video_description = mobj.group(1).decode('utf-8')
1353 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1354 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1355 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1356 # TODO use another parser
1359 video_token = urllib.unquote_plus(video_info['token'][0])
1361 # Decide which formats to download
1362 req_format = self._downloader.params.get('format', None)
# RTMP videos carry a single 'conn' URL; otherwise build an itag->url map
# from url_encoded_fmt_stream_map and filter it against the quality list.
1364 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1365 self.report_rtmp_download()
1366 video_url_list = [(None, video_info['conn'][0])]
1367 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1368 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1369 url_data = [parse_qs(uds) for uds in url_data_strs]
1370 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1371 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1373 format_limit = self._downloader.params.get('format_limit', None)
1374 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1375 if format_limit is not None and format_limit in available_formats:
1376 format_list = available_formats[available_formats.index(format_limit):]
1378 format_list = available_formats
1379 existing_formats = [x for x in format_list if x in url_map]
1380 if len(existing_formats) == 0:
1381 self._downloader.trouble(u'ERROR: no known formats available for video')
1383 if self._downloader.params.get('listformats', None):
1384 self._print_formats(existing_formats)
1386 if req_format is None or req_format == 'best':
1387 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1388 elif req_format == 'worst':
1389 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1390 elif req_format in ('-1', 'all'):
1391 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1393 # Specific formats. We pick the first in a slash-delimited sequence.
1394 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1395 req_formats = req_format.split('/')
1396 video_url_list = None
1397 for rf in req_formats:
1399 video_url_list = [(rf, url_map[rf])]
1401 if video_url_list is None:
1402 self._downloader.trouble(u'ERROR: requested format not available')
1405 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1408 for format_param, video_real_url in video_url_list:
1409 # At this point we have a new video
1410 self._downloader.increment_downloads()
1413 video_extension = self._video_extensions.get(format_param, 'flv')
1416 # Process video information
1417 self._downloader.process_info({
1418 'id': video_id.decode('utf-8'),
1419 'url': video_real_url.decode('utf-8'),
1420 'uploader': video_uploader.decode('utf-8'),
1421 'upload_date': upload_date,
1422 'title': video_title,
1423 'stitle': simple_title,
1424 'ext': video_extension.decode('utf-8'),
1425 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1426 'thumbnail': video_thumbnail.decode('utf-8'),
1427 'description': video_description,
1428 'player_url': player_url,
1430 except UnavailableVideoError, err:
1431 self._downloader.trouble(u'\nERROR: unable to download video')
# Metacafe extractor: passes the family-filter disclaimer once at init,
# delegates "yt-" prefixed ids to the YouTube IE, then scrapes the watch
# page. NOTE(review): elided extract — lines are missing between the
# embedded original line numbers.
1434 class MetacafeIE(InfoExtractor):
1435 """Information Extractor for metacafe.com."""
1437 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1438 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1439 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1441 IE_NAME = u'metacafe'
# Takes a YoutubeIE instance so Metacafe-hosted YouTube videos can be
# delegated to it (see _real_extract).
1443 def __init__(self, youtube_ie, downloader=None):
1444 InfoExtractor.__init__(self, downloader)
1445 self._youtube_ie = youtube_ie
1447 def report_disclaimer(self):
1448 """Report disclaimer retrieval."""
1449 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1451 def report_age_confirmation(self):
1452 """Report attempt to confirm age."""
1453 self._downloader.to_screen(u'[metacafe] Confirming age')
1455 def report_download_webpage(self, video_id):
1456 """Report webpage download."""
1457 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1459 def report_extraction(self, video_id):
1460 """Report information extraction."""
1461 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page (for cookies), then POST the family-filter form
# to disable filtering for this session.
1463 def _real_initialize(self):
1464 # Retrieve disclaimer
1465 request = urllib2.Request(self._DISCLAIMER)
1467 self.report_disclaimer()
1468 disclaimer = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1476 'submit': "Continue - I'm over 18",
1478 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1480 self.report_age_confirmation()
1481 disclaimer = urllib2.urlopen(request).read()
1482 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1483 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1486 def _real_extract(self, url):
1487 # Extract id and simplified title from URL
1488 mobj = re.match(self._VALID_URL, url)
1490 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1493 video_id = mobj.group(1)
1495 # Check if video comes from YouTube
1496 mobj2 = re.match(r'^yt-(.*)$', video_id)
1497 if mobj2 is not None:
1498 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1501 # At this point we have a new video
1502 self._downloader.increment_downloads()
1504 simple_title = mobj.group(2).decode('utf-8')
1506 # Retrieve video webpage to extract further information
1507 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1509 self.report_download_webpage(video_id)
1510 webpage = urllib2.urlopen(request).read()
1511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message reads "unable retrieve" — missing "to" (cosmetic).
1512 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1515 # Extract URL, uploader and title from webpage
1516 self.report_extraction(video_id)
# Two layouts are handled: a direct &mediaURL= parameter (optionally signed
# with gdaKey), or a flashvars blob containing JSON-ish mediaData.
1517 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1518 if mobj is not None:
1519 mediaURL = urllib.unquote(mobj.group(1))
1520 video_extension = mediaURL[-3:]
1522 # Extract gdaKey if available
1523 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1525 video_url = mediaURL
1527 gdaKey = mobj.group(1)
1528 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1530 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1532 self._downloader.trouble(u'ERROR: unable to extract media URL')
1534 vardict = parse_qs(mobj.group(1))
1535 if 'mediaData' not in vardict:
1536 self._downloader.trouble(u'ERROR: unable to extract media URL')
1538 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1540 self._downloader.trouble(u'ERROR: unable to extract media URL')
1542 mediaURL = mobj.group(1).replace('\\/', '/')
1543 video_extension = mediaURL[-3:]
1544 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1546 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1548 self._downloader.trouble(u'ERROR: unable to extract title')
1550 video_title = mobj.group(1).decode('utf-8')
1551 video_title = sanitize_title(video_title)
1553 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1555 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1557 video_uploader = mobj.group(1)
1560 # Process video information
1561 self._downloader.process_info({
1562 'id': video_id.decode('utf-8'),
1563 'url': video_url.decode('utf-8'),
1564 'uploader': video_uploader.decode('utf-8'),
1565 'upload_date': u'NA',
1566 'title': video_title,
1567 'stitle': simple_title,
1568 'ext': video_extension.decode('utf-8'),
1572 except UnavailableVideoError:
1573 self._downloader.trouble(u'\nERROR: unable to download video')
# Dailymotion extractor: scrapes the watch page (with the family filter
# disabled via cookie) for the SD flash URL. NOTE(review): elided extract —
# lines are missing between the embedded original line numbers.
1576 class DailymotionIE(InfoExtractor):
1577 """Information Extractor for Dailymotion"""
1579 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1580 IE_NAME = u'dailymotion'
1582 def __init__(self, downloader=None):
1583 InfoExtractor.__init__(self, downloader)
1585 def report_download_webpage(self, video_id):
1586 """Report webpage download."""
1587 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1589 def report_extraction(self, video_id):
1590 """Report information extraction."""
1591 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1593 def _real_extract(self, url):
1594 # Extract id and simplified title from URL
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1600 # At this point we have a new video
1601 self._downloader.increment_downloads()
1602 video_id = mobj.group(1)
# The simplified title comes straight from the URL slug (group 2).
1604 simple_title = mobj.group(2).decode('utf-8')
1605 video_extension = 'flv'
1607 # Retrieve video webpage to extract further information
1608 request = urllib2.Request(url)
# Cookie disables Dailymotion's family filter for this request.
1609 request.add_header('Cookie', 'family_filter=off')
1611 self.report_download_webpage(video_id)
1612 webpage = urllib2.urlopen(request).read()
1613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message reads "unable retrieve" — missing "to" (cosmetic).
1614 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1617 # Extract URL, uploader and title from webpage
1618 self.report_extraction(video_id)
# The media URL is nested: an addVariable("sequence", ...) blob, inside
# which an "sdURL" field holds the escaped flash URL.
1619 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1621 self._downloader.trouble(u'ERROR: unable to extract media URL')
1623 sequence = urllib.unquote(mobj.group(1))
1624 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1626 self._downloader.trouble(u'ERROR: unable to extract media URL')
1628 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1630 # if needed add http://www.dailymotion.com/ if relative URL
1632 video_url = mediaURL
1634 mobj = re.search(r'(?im)<title>\s*(.+)\s*-\s*Video\s+Dailymotion</title>', webpage)
1636 self._downloader.trouble(u'ERROR: unable to extract title')
1638 video_title = mobj.group(1).decode('utf-8')
1639 video_title = sanitize_title(video_title)
1641 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1643 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1645 video_uploader = mobj.group(1)
1648 # Process video information
1649 self._downloader.process_info({
1650 'id': video_id.decode('utf-8'),
1651 'url': video_url.decode('utf-8'),
1652 'uploader': video_uploader.decode('utf-8'),
1653 'upload_date': u'NA',
1654 'title': video_title,
1655 'stitle': simple_title,
1656 'ext': video_extension.decode('utf-8'),
1660 except UnavailableVideoError:
1661 self._downloader.trouble(u'\nERROR: unable to download video')
# Google Video extractor: prefers the mp4 download_url, falling back to the
# escaped flv videoUrl. NOTE(review): elided extract — lines are missing
# between the embedded original line numbers.
1664 class GoogleIE(InfoExtractor):
1665 """Information extractor for video.google.com."""
1667 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1668 IE_NAME = u'video.google'
1670 def __init__(self, downloader=None):
1671 InfoExtractor.__init__(self, downloader)
1673 def report_download_webpage(self, video_id):
1674 """Report webpage download."""
1675 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1677 def report_extraction(self, video_id):
1678 """Report information extraction."""
1679 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1681 def _real_extract(self, url):
1682 # Extract id from URL
1683 mobj = re.match(self._VALID_URL, url)
1685 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1688 # At this point we have a new video
1689 self._downloader.increment_downloads()
1690 video_id = mobj.group(1)
1692 video_extension = 'mp4'
1694 # Retrieve video webpage to extract further information
1695 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1697 self.report_download_webpage(video_id)
1698 webpage = urllib2.urlopen(request).read()
1699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1700 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1703 # Extract URL, uploader, and title from webpage
1704 self.report_extraction(video_id)
# First choice: the direct mp4 download_url; otherwise fall back to the
# flv videoUrl, whose \x3d/\x26 escapes must be unescaped by hand.
1705 mobj = re.search(r"download_url:'([^']+)'", webpage)
1707 video_extension = 'flv'
1708 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1710 self._downloader.trouble(u'ERROR: unable to extract media URL')
1712 mediaURL = urllib.unquote(mobj.group(1))
1713 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1714 mediaURL = mediaURL.replace('\\x26', '\x26')
1716 video_url = mediaURL
1718 mobj = re.search(r'<title>(.*)</title>', webpage)
1720 self._downloader.trouble(u'ERROR: unable to extract title')
1722 video_title = mobj.group(1).decode('utf-8')
1723 video_title = sanitize_title(video_title)
1724 simple_title = _simplify_title(video_title)
1726 # Extract video description
1727 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1729 self._downloader.trouble(u'ERROR: unable to extract video description')
1731 video_description = mobj.group(1).decode('utf-8')
1732 if not video_description:
1733 video_description = 'No description available.'
# The thumbnail requires a second request (a search-results page), so it is
# only fetched when --get-thumbnail asked for it.
1735 # Extract video thumbnail
1736 if self._downloader.params.get('forcethumbnail', False):
1737 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1739 webpage = urllib2.urlopen(request).read()
1740 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1741 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1743 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1747 video_thumbnail = mobj.group(1)
1748 else: # we need something to pass to process_info
1749 video_thumbnail = ''
1752 # Process video information
1753 self._downloader.process_info({
1754 'id': video_id.decode('utf-8'),
1755 'url': video_url.decode('utf-8'),
1757 'upload_date': u'NA',
1758 'title': video_title,
1759 'stitle': simple_title,
1760 'ext': video_extension.decode('utf-8'),
1764 except UnavailableVideoError:
1765 self._downloader.trouble(u'\nERROR: unable to download video')
1768 class PhotobucketIE(InfoExtractor):
1769 """Information extractor for photobucket.com."""
1771 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1772 IE_NAME = u'photobucket'
1774 def __init__(self, downloader=None):
1775 InfoExtractor.__init__(self, downloader)
1777 def report_download_webpage(self, video_id):
1778 """Report webpage download."""
1779 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1781 def report_extraction(self, video_id):
1782 """Report information extraction."""
1783 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1785 def _real_extract(self, url):
1786 # Extract id from URL
1787 mobj = re.match(self._VALID_URL, url)
1789 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1792 # At this point we have a new video
1793 self._downloader.increment_downloads()
1794 video_id = mobj.group(1)
1796 video_extension = 'flv'
1798 # Retrieve video webpage to extract further information
1799 request = urllib2.Request(url)
1801 self.report_download_webpage(video_id)
1802 webpage = urllib2.urlopen(request).read()
1803 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1804 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1807 # Extract URL, uploader, and title from webpage
1808 self.report_extraction(video_id)
1809 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1811 self._downloader.trouble(u'ERROR: unable to extract media URL')
1813 mediaURL = urllib.unquote(mobj.group(1))
1815 video_url = mediaURL
1817 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1819 self._downloader.trouble(u'ERROR: unable to extract title')
1821 video_title = mobj.group(1).decode('utf-8')
1822 video_title = sanitize_title(video_title)
1823 simple_title = _simplify_title(vide_title)
1825 video_uploader = mobj.group(2).decode('utf-8')
1828 # Process video information
1829 self._downloader.process_info({
1830 'id': video_id.decode('utf-8'),
1831 'url': video_url.decode('utf-8'),
1832 'uploader': video_uploader,
1833 'upload_date': u'NA',
1834 'title': video_title,
1835 'stitle': simple_title,
1836 'ext': video_extension.decode('utf-8'),
1840 except UnavailableVideoError:
1841 self._downloader.trouble(u'\nERROR: unable to download video')
1844 class YahooIE(InfoExtractor):
1845 """Information extractor for video.yahoo.com."""
1847 # _VALID_URL matches all Yahoo! Video URLs
1848 # _VPAGE_URL matches only the extractable '/watch/' URLs
1849 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1850 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1851 IE_NAME = u'video.yahoo'
1853 def __init__(self, downloader=None):
1854 InfoExtractor.__init__(self, downloader)
1856 def report_download_webpage(self, video_id):
1857 """Report webpage download."""
1858 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1860 def report_extraction(self, video_id):
1861 """Report information extraction."""
1862 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1864 def _real_extract(self, url, new_video=True):
1865 # Extract ID from URL
1866 mobj = re.match(self._VALID_URL, url)
1868 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1871 # At this point we have a new video
1872 self._downloader.increment_downloads()
1873 video_id = mobj.group(2)
1874 video_extension = 'flv'
1876 # Rewrite valid but non-extractable URLs as
1877 # extractable English language /watch/ URLs
1878 if re.match(self._VPAGE_URL, url) is None:
1879 request = urllib2.Request(url)
1881 webpage = urllib2.urlopen(request).read()
1882 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1886 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1888 self._downloader.trouble(u'ERROR: Unable to extract id field')
1890 yahoo_id = mobj.group(1)
1892 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1894 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1896 yahoo_vid = mobj.group(1)
1898 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1899 return self._real_extract(url, new_video=False)
1901 # Retrieve video webpage to extract further information
1902 request = urllib2.Request(url)
1904 self.report_download_webpage(video_id)
1905 webpage = urllib2.urlopen(request).read()
1906 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910 # Extract uploader and title from webpage
1911 self.report_extraction(video_id)
1912 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1914 self._downloader.trouble(u'ERROR: unable to extract video title')
1916 video_title = mobj.group(1).decode('utf-8')
1917 simple_title = _simplify_title(video_title)
1919 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1921 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1923 video_uploader = mobj.group(1).decode('utf-8')
1925 # Extract video thumbnail
1926 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1928 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1930 video_thumbnail = mobj.group(1).decode('utf-8')
1932 # Extract video description
1933 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1935 self._downloader.trouble(u'ERROR: unable to extract video description')
1937 video_description = mobj.group(1).decode('utf-8')
1938 if not video_description:
1939 video_description = 'No description available.'
1941 # Extract video height and width
1942 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1944 self._downloader.trouble(u'ERROR: unable to extract video height')
1946 yv_video_height = mobj.group(1)
1948 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1950 self._downloader.trouble(u'ERROR: unable to extract video width')
1952 yv_video_width = mobj.group(1)
1954 # Retrieve video playlist to extract media URL
1955 # I'm not completely sure what all these options are, but we
1956 # seem to need most of them, otherwise the server sends a 401.
1957 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1958 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1959 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1960 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1961 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1963 self.report_download_webpage(video_id)
1964 webpage = urllib2.urlopen(request).read()
1965 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1966 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1969 # Extract media URL from playlist XML
1970 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1972 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1974 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1975 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1978 # Process video information
1979 self._downloader.process_info({
1980 'id': video_id.decode('utf-8'),
1982 'uploader': video_uploader,
1983 'upload_date': u'NA',
1984 'title': video_title,
1985 'stitle': simple_title,
1986 'ext': video_extension.decode('utf-8'),
1987 'thumbnail': video_thumbnail.decode('utf-8'),
1988 'description': video_description,
1989 'thumbnail': video_thumbnail,
1992 except UnavailableVideoError:
1993 self._downloader.trouble(u'\nERROR: unable to download video')
1996 class VimeoIE(InfoExtractor):
1997 """Information extractor for vimeo.com."""
1999 # _VALID_URL matches Vimeo URLs
2000 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2003 def __init__(self, downloader=None):
2004 InfoExtractor.__init__(self, downloader)
2006 def report_download_webpage(self, video_id):
2007 """Report webpage download."""
2008 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2010 def report_extraction(self, video_id):
2011 """Report information extraction."""
2012 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
	# Extract the numeric video ID and fetch Vimeo's moogaloop XML for it,
	# then hand the assembled metadata to the downloader.
	# NOTE(review): this listing is elided -- 'if mobj is None:' guards,
	# 'try:' lines and 'return's are missing; indentation below is
	# reconstructed and must be verified against the full file.
	# Extract ID from URL
	mobj = re.match(self._VALID_URL, url)
	self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(1)

	# Retrieve video webpage to extract further information
	request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Now we begin extracting as much information as we can from what we
	# retrieved. First we extract the information common to all extractors,
	# and latter we extract those that are Vimeo specific.
	self.report_extraction(video_id)

	# Title comes from the <caption> element of the moogaloop XML.
	mobj = re.search(r'<caption>(.*?)</caption>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video title')
	video_title = mobj.group(1).decode('utf-8')
	simple_title = _simplify_title(video_title)

	# Uploader is the path component of the <uploader_url> element.
	mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video uploader')
	video_uploader = mobj.group(1).decode('utf-8')

	# Extract video thumbnail
	mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
	video_thumbnail = mobj.group(1).decode('utf-8')

	# # Extract video description
	# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
	# self._downloader.trouble(u'ERROR: unable to extract video description')
	# video_description = mobj.group(1).decode('utf-8')
	# if not video_description: video_description = 'No description available.'
	# Placeholder while the real description extraction above is commented out.
	video_description = 'Foo.'

	# Vimeo specific: extract request signature
	mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract request signature')
	sig = mobj.group(1).decode('utf-8')

	# Vimeo specific: extract video quality information
	mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video quality information')
	quality = mobj.group(1).decode('utf-8')

	# NOTE(review): the HD/SD branch bodies are elided in this listing.
	if int(quality) == 1:

	# Vimeo specific: Extract request signature expiration
	mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
	sig_exp = mobj.group(1).decode('utf-8')

	video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)

	# Process video information
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		# NOTE(review): duplicate 'thumbnail'/'description' keys below --
		# in a dict literal the later entries win, so the decoded
		# 'thumbnail' value on the next line is silently discarded.
		'thumbnail': video_thumbnail.decode('utf-8'),
		'description': video_description,
		'thumbnail': video_thumbnail,
		'description': video_description,
	except UnavailableVideoError:
		self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	Downloads an arbitrary page and tries to find a direct media URL in
	JW Player / SWFObject style flashvars ('file='/'source=' parameters).
	NOTE(review): this listing is elided -- 'if mobj is None:' guards,
	'try:' lines and 'return's are missing; indentation is reconstructed.
	"""
	IE_NAME = u'generic'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download (and warn that the generic fallback is used)."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# At this point we have a new video
		self._downloader.increment_downloads()

		# Use the last path component as a provisional id until a media
		# URL is found below.
		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		# Broaden the search a little bit
		mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_url = urllib.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		# NOTE(review): message says 'title' but this match is for the
		# uploader/domain -- looks like a copy-paste slip; confirm upstream.
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_uploader = mobj.group(1).decode('utf-8')

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader,
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles 'ytsearch[N|all]:<query>' pseudo-URLs; actual downloading is
	delegated to the wrapped YoutubeIE instance.
	NOTE(review): this listing is elided (guards, 'try:'/'return' lines,
	loop headers are missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on results, enforced in _real_extract below.
	_max_youtube_results = 1000
	IE_NAME = u'youtube:search'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for per-video extraction.
		self._youtube_ie = youtube_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		# Parse the 'ytsearch<N>:' prefix to decide how many results to fetch.
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_youtube_results:
			self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
			n = self._max_youtube_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# Pull the id out of href="/watch?v=ID" by splitting on '='
			# and dropping the trailing quote.
			video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		# No "Next" link -> last results page; flush what we collected.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles 'gvsearch[N|all]:<query>' pseudo-URLs, mirroring
	YoutubeSearchIE; delegates per-video work to the wrapped GoogleIE.
	NOTE(review): this listing is elided (guards, 'try:'/'return' lines,
	loop headers are missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	# Hard cap on results, enforced in _real_extract below.
	_max_google_results = 1000
	IE_NAME = u'video.google:search'

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for per-video extraction.
		self._google_ie = google_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		# Parse the 'gvsearch<N>:' prefix to decide how many results to fetch.
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_google_results:
			self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
			n = self._max_google_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			video_id = mobj.group(1)
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		# No "Next" link -> last results page; flush what we collected.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Handles 'yvsearch[N|all]:<query>' pseudo-URLs, mirroring
	YoutubeSearchIE; delegates per-video work to the wrapped YahooIE.
	NOTE(review): this listing is elided (guards, 'try:'/'return' lines,
	loop headers are missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# Hard cap on results, enforced in _real_extract below.
	_max_yahoo_results = 1000
	IE_NAME = u'video.yahoo:search'

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for per-video extraction.
		self._yahoo_ie = yahoo_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		# Parse the 'yvsearch<N>:' prefix to decide how many results to fetch.
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_yahoo_results:
			self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
			n = self._max_yahoo_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			video_id = mobj.group(1)
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

		# No "Next" link -> last results page; flush what we collected.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

		pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks a playlist/artist/course page by page, collects watch?v= ids
	and feeds them one by one to the wrapped YoutubeIE.
	NOTE(review): this listing is elided (guards, 'try:'/'break'/'return'
	lines are missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	IE_NAME = u'youtube:playlist'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for per-video extraction.
		self._youtube_ie = youtube_ie

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Group 3 is a single video id embedded in the playlist URL:
		# extract just that video instead of the whole list.
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		playlist_prefix = 'p'
		playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)

		self.report_download_page(playlist_id, pagenum)
		url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
		request = urllib2.Request(url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))
		video_ids.extend(ids_in_page)

		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
		pagenum = pagenum + 1

		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		# NOTE(review): with the default playlistend == -1 this slice drops
		# the final video; YoutubeUserIE below special-cases -1 to avoid
		# exactly that. Confirm whether this is a known bug here.
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Pages through the GData uploads feed of a user (PAGE_SIZE ids per
	request) and feeds each id to the wrapped YoutubeIE.
	NOTE(review): this listing is elided (guards, 'try:'/'break'/'return'
	lines are missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Delegate for per-video extraction.
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# GData indices are 1-based.
		start_index = pagenum * self._GDATA_PAGE_SIZE + 1
		self.report_download_page(username, start_index)

		request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))

		video_ids.extend(ids_in_page)

		# A little optimization - if current page is not
		# "full", ie. does not contain PAGE_SIZE video ids then
		# we can assume that this page is the last one - there
		# are no more ids on further pages - no need to query
		if len(ids_in_page) < self._GDATA_PAGE_SIZE:

		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# -1 means "until the end"; slicing with [start:-1] would drop
		# the final id, hence the explicit branch.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com

	Simulates pressing the 'Free download' button, then scrapes the real
	fileshare URL and title out of the resulting page.
	NOTE(review): this listing is elided ('try:'/'return' lines are
	missing); indentation below is reconstructed.
	"""
	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 mimics the button).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		self.report_download_webpage(file_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': file_title,
			'stitle': file_title,
			'ext': file_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook

	Logs in via the mobile login form (username/password options or
	.netrc), scrapes metadata and per-format URLs out of JavaScript
	segments on the video page, then downloads the chosen format(s).
	NOTE(review): this listing is elided (guards, 'try:'/'return' lines,
	some literal bodies are missing); indentation is reconstructed.
	"""
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best-to-worst; format selection below relies on this order.
	_available_formats = ['video', 'highqual', 'lowqual']
	# NOTE(review): dict body elided in this listing.
	_video_extensions = {
	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# Map of metadata field -> regex that captures it from inline JS.
		data = {'title': r'\("video_title", "(.*?)"\)',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',

		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		# Collect one URL per known format name.
		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

	def _real_initialize(self):
		# Log in (best effort): credentials from --username/--password,
		# falling back to .netrc when --netrc is set.
		if self._downloader is None:
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		if useremail is None:

		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		login_results = urllib2.urlopen(request).read()
		# The login form reappearing in the response means login failed.
		if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group('ID')

		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		page = urllib2.urlopen(request)
		video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = video_info['owner']

		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = _simplify_title(video_title)

		# Thumbnail is optional: warn and continue with an empty string.
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		video_thumbnail = video_info['thumbnail']

		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				upload_date = time.strftime('%Y%m%d', timetuple[0:9])

		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			if req_format not in url_map:
				self._downloader.trouble(u'ERROR: requested format not available')
			video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			video_extension = self._video_extensions.get(format_param, 'mp4')

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_real_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description.decode('utf-8'),
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Requests the JSON skin of a blip.tv URL; if the server answers with a
	video/* Content-Type instead, treats it as a direct download.
	NOTE(review): this listing is elided (guards, 'try:'/'return' lines,
	parts of the info dicts are missing); indentation is reconstructed.
	"""
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Ask for the JSON representation of the page.
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		urlh = urllib2.urlopen(request)
		if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
			basename = url.split('/')[-1]
			title,ext = os.path.splitext(basename)
			title = title.decode('UTF-8')
			ext = ext.replace('.', '')
			self.report_direct_download(title)
			'stitle': _simplify_title(title),
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

		if info is None: # Regular URL
			json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

			json_data = json.loads(json_code)
			if 'Post' in json_data:
				data = json_data['Post']

			# blip.tv datestamps look like '08-24-11 03:14PM'.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			'id': data['item_id'],
			'uploader': data['display_name'],
			'upload_date': upload_date,
			'title': data['title'],
			'stitle': _simplify_title(data['title']),
			'format': data['media']['mimeType'],
			'thumbnail': data['thumbnailUrl'],
			'description': data['description'],
			'player_url': data['embedUrl']
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

		self._downloader.increment_downloads()
		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3029 class MyVideoIE(InfoExtractor):
3030 """Information Extractor for myvideo.de."""
3032 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3033 IE_NAME = u'myvideo'
3035 def __init__(self, downloader=None):
3036 InfoExtractor.__init__(self, downloader)
3038 def report_download_webpage(self, video_id):
3039 """Report webpage download."""
3040 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3042 def report_extraction(self, video_id):
3043 """Report information extraction."""
3044 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3046 def _real_extract(self,url):
3047 mobj = re.match(self._VALID_URL, url)
3049 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3052 video_id = mobj.group(1)
3055 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3057 self.report_download_webpage(video_id)
3058 webpage = urllib2.urlopen(request).read()
3059 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3060 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3063 self.report_extraction(video_id)
3064 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3067 self._downloader.trouble(u'ERROR: unable to extract media URL')
3069 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3071 mobj = re.search('<title>([^<]+)</title>', webpage)
3073 self._downloader.trouble(u'ERROR: unable to extract title')
3076 video_title = mobj.group(1)
3077 video_title = sanitize_title(video_title)
3079 simple_title = _simplify_title(video_title)
3082 self._downloader.process_info({
3086 'upload_date': u'NA',
3087 'title': video_title,
3088 'stitle': simple_title,
3093 except UnavailableVideoError:
3094 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): this listing is a mangled paste -- the original file's line
# numbers are fused into each line, indentation is lost, and lines are
# missing (gaps in the fused numbering).  Code left byte-identical; comments
# only.  Restore structure against a pristine copy before editing logic.
#
# Extractor for The Daily Show / The Colbert Report full episodes.  Flow:
# resolve short names (":tds", ":cr", ...) to a full-episodes page, follow
# the HTTP redirect to a concrete episode, locate the Flash player URL, pull
# the MRSS index feed, then a per-item mediaGen config XML that lists the
# actual renditions.
3096 class ComedyCentralIE(InfoExtractor):
3097 """Information extractor for The Daily Show and Colbert Report """
# The regex matches either a bare short name (":tds" etc.) or a
# full-episodes URL with named groups "showname" and "episode".
3099 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3100 IE_NAME = u'comedycentral'
3102 def report_extraction(self, episode_id):
3103 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3105 def report_config_download(self, episode_id):
3106 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3108 def report_index_download(self, episode_id):
3109 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3111 def report_player_url(self, episode_id):
3112 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3114 def _real_extract(self, url):
3115 mobj = re.match(self._VALID_URL, url)
3117 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Short names are rewritten to the show's full-episodes landing page and
# re-matched so the named groups are populated.
3120 if mobj.group('shortname'):
3121 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3122 url = u'http://www.thedailyshow.com/full-episodes/'
3124 url = u'http://www.colbertnation.com/full-episodes/'
3125 mobj = re.match(self._VALID_URL, url)
3126 assert mobj is not None
# No episode in the URL means "download the newest episode": the server
# redirects the landing page to it.
3128 dlNewest = not mobj.group('episode')
3130 epTitle = mobj.group('showname')
3132 epTitle = mobj.group('episode')
3134 req = urllib2.Request(url)
3135 self.report_extraction(epTitle)
3137 htmlHandle = urllib2.urlopen(req)
3138 html = htmlHandle.read()
3139 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3140 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# geturl() yields the post-redirect URL, which must now name an episode.
3143 url = htmlHandle.geturl()
3144 mobj = re.match(self._VALID_URL, url)
3146 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3148 if mobj.group('episode') == '':
3149 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3151 epTitle = mobj.group('episode')
# The <param name="movie"> value carries both the player URL (group 0)
# and the mtvn: URI (group 1) used to query the feeds below.
3153 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3154 if len(mMovieParams) == 0:
3155 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3158 playerUrl_raw = mMovieParams[0][0]
3159 self.report_player_url(epTitle)
3161 urlHandle = urllib2.urlopen(playerUrl_raw)
3162 playerUrl = urlHandle.geturl()
3163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3164 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3167 uri = mMovieParams[0][1]
3168 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3169 self.report_index_download(epTitle)
3171 indexXml = urllib2.urlopen(indexUrl).read()
3172 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3173 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video act; the guid encodes show id and short media id.
3176 idoc = xml.etree.ElementTree.fromstring(indexXml)
3177 itemEls = idoc.findall('.//item')
3178 for itemEl in itemEls:
3179 mediaId = itemEl.findall('./guid')[0].text
3180 shortMediaId = mediaId.split(':')[-1]
3181 showId = mediaId.split(':')[-2].replace('.com', '')
3182 officialTitle = itemEl.findall('./title')[0].text
3183 officialDate = itemEl.findall('./pubDate')[0].text
3185 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3186 urllib.urlencode({'uri': mediaId}))
3187 configReq = urllib2.Request(configUrl)
3188 self.report_config_download(epTitle)
3190 configXml = urllib2.urlopen(configReq).read()
3191 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3192 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Each <rendition> pairs a bitrate with a source URL; they are collected
# (into "turls", defined on a missing line) and the last -- presumably
# highest-bitrate -- entry is taken below.  TODO confirm ordering.
3195 cdoc = xml.etree.ElementTree.fromstring(configXml)
3197 for rendition in cdoc.findall('.//rendition'):
3198 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3202 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3205 # For now, just pick the highest bitrate
3206 format,video_url = turls[-1]
3208 self._downloader.increment_downloads()
3210 effTitle = showId + u'-' + epTitle
3215 'upload_date': officialDate,
3217 'stitle': _simplify_title(effTitle),
3221 'description': officialTitle,
3222 'player_url': playerUrl
3226 self._downloader.process_info(info)
3227 except UnavailableVideoError, err:
3228 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3232 class EscapistIE(InfoExtractor):
3233 """Information extractor for The Escapist """
3235 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3236 IE_NAME = u'escapist'
3238 def report_extraction(self, showName):
3239 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3241 def report_config_download(self, showName):
3242 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3244 def _real_extract(self, url):
3245 htmlParser = HTMLParser.HTMLParser()
3247 mobj = re.match(self._VALID_URL, url)
3249 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3251 showName = mobj.group('showname')
3252 videoId = mobj.group('episode')
3254 self.report_extraction(showName)
3256 webPage = urllib2.urlopen(url).read()
3257 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3258 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3261 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3262 description = htmlParser.unescape(descMatch.group(1))
3263 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3264 imgUrl = htmlParser.unescape(imgMatch.group(1))
3265 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3266 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3267 configUrlMatch = re.search('config=(.*)$', playerUrl)
3268 configUrl = urllib2.unquote(configUrlMatch.group(1))
3270 self.report_config_download(showName)
3272 configJSON = urllib2.urlopen(configUrl).read()
3273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3274 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3277 # Technically, it's JavaScript, not JSON
3278 configJSON = configJSON.replace("'", '"')
3281 config = json.loads(configJSON)
3282 except (ValueError,), err:
3283 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3286 playlist = config['playlist']
3287 videoUrl = playlist[1]['url']
3289 self._downloader.increment_downloads()
3293 'uploader': showName,
3294 'upload_date': None,
3296 'stitle': _simplify_title(showName),
3299 'thumbnail': imgUrl,
3300 'description': description,
3301 'player_url': playerUrl,
3305 self._downloader.process_info(info)
3306 except UnavailableVideoError, err:
3307 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# collegehumor.com extractor: scrapes the internal video id from the page,
# then reads a moogaloop metadata XML for title/description/file URL.
3310 class CollegeHumorIE(InfoExtractor):
3311 """Information extractor for collegehumor.com"""
3313 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3314 IE_NAME = u'collegehumor'
3316 def report_webpage(self, video_id):
3317 """Report information extraction."""
3318 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3320 def report_extraction(self, video_id):
3321 """Report information extraction."""
3322 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3324 def _real_extract(self, url):
3325 htmlParser = HTMLParser.HTMLParser()
3327 mobj = re.match(self._VALID_URL, url)
3329 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3331 video_id = mobj.group('videoid')
3333 self.report_webpage(video_id)
3334 request = urllib2.Request(url)
3336 webpage = urllib2.urlopen(request).read()
3337 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3338 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page id attribute carries the internal id used by the metadata feed.
3341 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3343 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3345 internal_video_id = m.group('internalvideoid')
# "info" dict initialization spans missing lines here.
3349 'internal_id': internal_video_id,
3352 self.report_extraction(video_id)
3353 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3355 metaXml = urllib2.urlopen(xmlUrl).read()
3356 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3357 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# All remaining metadata comes from the first <video> node of the XML;
# indexing errors fall through to the "Invalid metadata" branch below.
3360 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3362 videoNode = mdoc.findall('./video')[0]
3363 info['description'] = videoNode.findall('./description')[0].text
3364 info['title'] = videoNode.findall('./caption')[0].text
3365 info['stitle'] = _simplify_title(info['title'])
3366 info['url'] = videoNode.findall('./file')[0].text
3367 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3368 info['ext'] = info['url'].rpartition('.')[2]
3369 info['format'] = info['ext']
3371 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3374 self._downloader.increment_downloads()
3377 self._downloader.process_info(info)
3378 except UnavailableVideoError, err:
3379 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# xvideos.com extractor: flv URL, title and thumbnail are all scraped from
# the watch page with regexes.
3382 class XVideosIE(InfoExtractor):
3383 """Information extractor for xvideos.com"""
3385 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3386 IE_NAME = u'xvideos'
3388 def report_webpage(self, video_id):
3389 """Report information extraction."""
3390 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3392 def report_extraction(self, video_id):
3393 """Report information extraction."""
3394 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3396 def _real_extract(self, url):
3397 htmlParser = HTMLParser.HTMLParser()
3399 mobj = re.match(self._VALID_URL, url)
3401 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3403 video_id = mobj.group(1).decode('utf-8')
3405 self.report_webpage(video_id)
3407 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3409 webpage = urllib2.urlopen(request).read()
3410 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3411 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3414 self.report_extraction(video_id)
# The flv URL is percent-encoded inside a query parameter.
3418 mobj = re.search(r'flv_url=(.+?)&', webpage)
3420 self._downloader.trouble(u'ERROR: unable to extract video url')
3422 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text up to the " - XVID..." suffix.
3426 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3428 self._downloader.trouble(u'ERROR: unable to extract video title')
3430 video_title = mobj.group(1).decode('utf-8')
3433 # Extract video thumbnail
3434 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3436 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3438 video_thumbnail = mobj.group(1).decode('utf-8')
3442 self._downloader.increment_downloads()
# "info" dict assembly spans missing lines here (id/url/uploader/ext).
3447 'upload_date': None,
3448 'title': video_title,
3449 'stitle': _simplify_title(video_title),
3452 'thumbnail': video_thumbnail,
3453 'description': None,
3458 self._downloader.process_info(info)
3459 except UnavailableVideoError, err:
3460 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines (including the None-checks after several re.search calls).
# Code byte-identical; comments only.
3463 class SoundcloudIE(InfoExtractor):
3464 """Information extractor for soundcloud.com
3465 To access the media, the uid of the song and a stream token
3466 must be extracted from the page source and the script must make
3467 a request to media.soundcloud.com/crossdomain.xml. Then
3468 the media can be grabbed by requesting from an url composed
3469 of the stream token and uid
3472 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3473 IE_NAME = u'soundcloud'
3475 def __init__(self, downloader=None):
3476 InfoExtractor.__init__(self, downloader)
3478 def report_webpage(self, video_id):
3479 """Report information extraction."""
3480 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3482 def report_extraction(self, video_id):
3483 """Report information extraction."""
3484 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3486 def _real_extract(self, url):
3487 htmlParser = HTMLParser.HTMLParser()
3489 mobj = re.match(self._VALID_URL, url)
3491 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3494 # extract uploader (which is in the url)
3495 uploader = mobj.group(1).decode('utf-8')
3496 # extract simple title (uploader + slug of song title)
3497 slug_title = mobj.group(2).decode('utf-8')
3498 simple_title = uploader + '-' + slug_title
3500 self.report_webpage('%s/%s' % (uploader, slug_title))
3502 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3504 webpage = urllib2.urlopen(request).read()
3505 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3506 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3509 self.report_extraction('%s/%s' % (uploader, slug_title))
3511 # extract uid and stream token that soundcloud hands out for access
3512 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3514 video_id = mobj.group(1)
3515 stream_token = mobj.group(2)
3517 # extract unsimplified title
3518 mobj = re.search('"title":"(.*?)",', webpage)
3520 title = mobj.group(1)
3522 # construct media url (with uid/token)
3523 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3524 mediaURL = mediaURL % (video_id, stream_token)
# Description and upload date are best-effort: defaults are kept when the
# page markup does not match.
3527 description = u'No description available'
3528 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3530 description = mobj.group(1)
3534 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3537 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3538 except Exception, e:
3541 # for soundcloud, a request to a cross domain is required for cookies
3542 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3545 self._downloader.process_info({
3546 'id': video_id.decode('utf-8'),
3548 'uploader': uploader.decode('utf-8'),
3549 'upload_date': upload_date,
3550 'title': simple_title.decode('utf-8'),
3551 'stitle': simple_title.decode('utf-8'),
3555 'description': description.decode('utf-8')
3557 except UnavailableVideoError:
3558 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# infoq.com extractor: the rtmpe media path is base64-encoded inside a
# "jsclassref" attribute on the page.
3561 class InfoQIE(InfoExtractor):
3562 """Information extractor for infoq.com"""
3564 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3567 def report_webpage(self, video_id):
3568 """Report information extraction."""
3569 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3571 def report_extraction(self, video_id):
3572 """Report information extraction."""
3573 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3575 def _real_extract(self, url):
3576 htmlParser = HTMLParser.HTMLParser()
3578 mobj = re.match(self._VALID_URL, url)
3580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3583 self.report_webpage(url)
3585 request = urllib2.Request(url)
3587 webpage = urllib2.urlopen(request).read()
3588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3592 self.report_extraction(url)
# jsclassref holds the base64-encoded tail of the rtmpe stream URL.
3596 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3598 self._downloader.trouble(u'ERROR: unable to extract video url')
3600 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3604 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3606 self._downloader.trouble(u'ERROR: unable to extract video title')
3608 video_title = mobj.group(1).decode('utf-8')
3610 # Extract description
3611 video_description = u'No description available.'
3612 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3613 if mobj is not None:
3614 video_description = mobj.group(1).decode('utf-8')
# Video id and extension are derived from the final path component of the
# media URL.
3616 video_filename = video_url.split('/')[-1]
3617 video_id, extension = video_filename.split('.')
3619 self._downloader.increment_downloads()
# "info" dict assembly spans missing lines here.
3624 'upload_date': None,
3625 'title': video_title,
3626 'stitle': _simplify_title(video_title),
3628 'format': extension, # Extension is always(?) mp4, but seems to be flv
3630 'description': video_description,
3635 self._downloader.process_info(info)
3636 except UnavailableVideoError, err:
3637 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# mixcloud.com extractor: queries the site's JSON API, negotiates a format
# (and optionally a bitrate) from the "audio_formats" section, and probes
# candidate URLs until one answers.
3639 class MixcloudIE(InfoExtractor):
3640 """Information extractor for www.mixcloud.com"""
3641 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3642 IE_NAME = u'mixcloud'
3644 def __init__(self, downloader=None):
3645 InfoExtractor.__init__(self, downloader)
3647 def report_download_json(self, file_id):
3648 """Report JSON download."""
3649 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3651 def report_extraction(self, file_id):
3652 """Report information extraction."""
3653 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3655 def get_urls(self, jsonData, fmt, bitrate='best'):
3656 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} dict or a plain url list;
# the TypeError branch handles the latter.
3659 bitrate_list = jsonData[fmt]
3660 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3661 bitrate = max(bitrate_list) # select highest
3663 url_list = jsonData[fmt][bitrate]
3664 except TypeError: # we have no bitrate info.
3665 url_list = jsonData[fmt]
3669 def check_urls(self, url_list):
3670 """Returns 1st active url from list"""
# Probes each candidate with an actual request; network failures mean
# "try the next one".
3671 for url in url_list:
3673 urllib2.urlopen(url)
3675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3680 def _print_formats(self, formats):
3681 print 'Available formats:'
3682 for fmt in formats.keys():
3683 for b in formats[fmt]:
3685 ext = formats[fmt][b][0]
3686 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3687 except TypeError: # we have no bitrate info
3688 ext = formats[fmt][0]
3689 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3692 def _real_extract(self, url):
3693 mobj = re.match(self._VALID_URL, url)
3695 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3697 # extract uploader & filename from url
3698 uploader = mobj.group(1).decode('utf-8')
3699 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3701 # construct API request
3702 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3703 # retrieve .json file with links to files
3704 request = urllib2.Request(file_url)
3706 self.report_download_json(file_url)
3707 jsonData = urllib2.urlopen(request).read()
3708 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3709 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3713 json_data = json.loads(jsonData)
3714 player_url = json_data['player_swf_url']
3715 formats = dict(json_data['audio_formats'])
3717 req_format = self._downloader.params.get('format', None)
# --list-formats short-circuits the download.
3720 if self._downloader.params.get('listformats', None):
3721 self._print_formats(formats)
# Default/best: first format whose candidate URL answers wins.
3724 if req_format is None or req_format == 'best':
3725 for format_param in formats.keys():
3726 url_list = self.get_urls(formats, format_param)
3728 file_url = self.check_urls(url_list)
3729 if file_url is not None:
3732 if req_format not in formats.keys():
3733 self._downloader.trouble(u'ERROR: format is not available')
3736 url_list = self.get_urls(formats, req_format)
3737 file_url = self.check_urls(url_list)
3738 format_param = req_format
3741 self._downloader.increment_downloads()
3743 # Process file information
3744 self._downloader.process_info({
3745 'id': file_id.decode('utf-8'),
3746 'url': file_url.decode('utf-8'),
3747 'uploader': uploader.decode('utf-8'),
3748 'upload_date': u'NA',
3749 'title': json_data['name'],
3750 'stitle': _simplify_title(json_data['name']),
3751 'ext': file_url.split('.')[-1].decode('utf-8'),
3752 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3753 'thumbnail': json_data['thumbnail_url'],
3754 'description': json_data['description'],
3755 'player_url': player_url.decode('utf-8'),
3757 except UnavailableVideoError, err:
3758 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# Stanford Open Classroom extractor with three entry granularities decided
# by the URL: a single video, a course page (recurses into its videos via
# self.extract), or the site root (recurses into courses).
3760 class StanfordOpenClassroomIE(InfoExtractor):
3761 """Information extractor for Stanford's Open ClassRoom"""
3763 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3764 IE_NAME = u'stanfordoc'
3766 def report_download_webpage(self, objid):
3767 """Report information extraction."""
3768 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3770 def report_extraction(self, video_id):
3771 """Report information extraction."""
3772 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3774 def _real_extract(self, url):
3775 mobj = re.match(self._VALID_URL, url)
3777 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: course + video named groups -> one concrete video.
3780 if mobj.group('course') and mobj.group('video'): # A specific video
3781 course = mobj.group('course')
3782 video = mobj.group('video')
3784 'id': _simplify_title(course + '_' + video),
3787 self.report_extraction(info['id'])
3788 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3789 xmlUrl = baseUrl + video + '.xml'
3791 metaXml = urllib2.urlopen(xmlUrl).read()
3792 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3793 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3795 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3797 info['title'] = mdoc.findall('./title')[0].text
3798 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3800 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3802 info['stitle'] = _simplify_title(info['title'])
3803 info['ext'] = info['url'].rpartition('.')[2]
3804 info['format'] = info['ext']
3805 self._downloader.increment_downloads()
3807 self._downloader.process_info(info)
3808 except UnavailableVideoError, err:
3809 self._downloader.trouble(u'\nERROR: unable to download video')
# Case 2: course only -> scrape the course page and recurse per video.
3810 elif mobj.group('course'): # A course page
3811 unescapeHTML = HTMLParser.HTMLParser().unescape
3813 course = mobj.group('course')
3815 'id': _simplify_title(course),
3819 self.report_download_webpage(info['id'])
3821 coursepage = urllib2.urlopen(url).read()
3822 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3823 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3826 m = re.search('<h1>([^<]+)</h1>', coursepage)
3828 info['title'] = unescapeHTML(m.group(1))
3830 info['title'] = info['id']
3831 info['stitle'] = _simplify_title(info['title'])
3833 m = re.search('<description>([^<]+)</description>', coursepage)
3835 info['description'] = unescapeHTML(m.group(1))
3837 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3840 'type': 'reference',
3841 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3845 for entry in info['list']:
3846 assert entry['type'] == 'reference'
3847 self.extract(entry['url'])
# Case 3 (else branch, header on a missing line): the site root -> scrape
# the home page and recurse per course.
3849 unescapeHTML = HTMLParser.HTMLParser().unescape
3852 'id': 'Stanford OpenClassroom',
3856 self.report_download_webpage(info['id'])
3857 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3859 rootpage = urllib2.urlopen(rootURL).read()
3860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3861 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3864 info['title'] = info['id']
3865 info['stitle'] = _simplify_title(info['title'])
3867 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3870 'type': 'reference',
3871 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3875 for entry in info['list']:
3876 assert entry['type'] == 'reference'
3877 self.extract(entry['url'])
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    # Downloader this PP is registered with; set via the constructor or
    # set_downloader().
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing
# NOTE(review): mangled paste -- fused line numbers, lost indentation,
# missing lines.  Code byte-identical; comments only.
#
# Post-processor that extracts the audio track of a downloaded video via
# ffmpeg/ffprobe, choosing between stream copy (lossless) and re-encoding
# based on the preferred codec and what the file actually contains.
3927 class FFmpegExtractAudioPP(PostProcessor):
3929 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3930 PostProcessor.__init__(self, downloader)
3931 if preferredcodec is None:
3932 preferredcodec = 'best'
3933 self._preferredcodec = preferredcodec
3934 self._preferredquality = preferredquality
3935 self._keepvideo = keepvideo
# Probes the file with ffprobe; returns the audio codec name, or (per the
# missing branches) presumably None when ffprobe fails -- TODO confirm.
3938 def get_audio_codec(path):
3940 cmd = ['ffprobe', '-show_streams', '--', path]
3941 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3942 output = handle.communicate()[0]
3943 if handle.wait() != 0:
3945 except (IOError, OSError):
# codec_name lines precede their codec_type line in ffprobe output, hence
# the remember-then-confirm scan.
3948 for line in output.split('\n'):
3949 if line.startswith('codec_name='):
3950 audio_codec = line.split('=')[1].strip()
3951 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Runs the actual extraction; '-vn' drops the video stream.
3956 def run_ffmpeg(path, out_path, codec, more_opts):
3958 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3959 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3961 except (IOError, OSError):
3964 def run(self, information):
3965 path = information['filepath']
3967 filecodec = self.get_audio_codec(path)
3968 if filecodec is None:
3969 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Lossless path: requested codec matches the source (m4a counts as aac in
# an mp4 container, needing only the bitstream filter).
3973 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
3974 if self._preferredcodec == 'm4a' and filecodec == 'aac':
3975 # Lossless, but in another container
3977 extension = self._preferredcodec
3978 more_opts = ['-absf', 'aac_adtstoasc']
3979 elif filecodec in ['aac', 'mp3', 'vorbis']:
3980 # Lossless if possible
3982 extension = filecodec
3983 if filecodec == 'aac':
3984 more_opts = ['-f', 'adts']
3985 if filecodec == 'vorbis':
# Fallback within the "best" branch: MP3 otherwise (assignment partly on
# missing lines).
3989 acodec = 'libmp3lame'
3992 if self._preferredquality is not None:
3993 more_opts += ['-ab', self._preferredquality]
# Lossy path: explicit re-encode to the requested codec.
3995 # We convert the audio (lossy)
3996 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3997 extension = self._preferredcodec
3999 if self._preferredquality is not None:
4000 more_opts += ['-ab', self._preferredquality]
4001 if self._preferredcodec == 'aac':
4002 more_opts += ['-f', 'adts']
4003 if self._preferredcodec == 'm4a':
4004 more_opts += ['-absf', 'aac_adtstoasc']
4005 if self._preferredcodec == 'vorbis':
4008 (prefix, ext) = os.path.splitext(path)
4009 new_path = prefix + '.' + extension
4010 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
4011 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
4014 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
4017 # Try to update the date time for extracted audio file.
4018 if information.get('filetime') is not None:
4020 os.utime(new_path, (time.time(), information['filetime']))
4022 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Source video is removed unless --keep-video was given.
4024 if not self._keepvideo:
4027 except (IOError, OSError):
4028 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Downstream PPs must see the audio file, not the deleted video.
4031 information['filepath'] = new_path
4035 def updateSelf(downloader, filename):
4036 ''' Update the program file with the latest version from the repository '''
4037 # Note: downloader only used for options
4038 if not os.access(filename, os.W_OK):
4039 sys.exit('ERROR: no write permissions on %s' % filename)
4041 downloader.to_screen('Updating to latest version...')
4045 urlh = urllib.urlopen(UPDATE_URL)
4046 newcontent = urlh.read()
4048 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4049 if vmatch is not None and vmatch.group(1) == __version__:
4050 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4054 except (IOError, OSError), err:
4055 sys.exit('ERROR: unable to download latest version')
4058 outf = open(filename, 'wb')
4060 outf.write(newcontent)
4063 except (IOError, OSError), err:
4064 sys.exit('ERROR: unable to overwrite current version')
4066 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4074 def _readOptions(filename):
4076 optionf = open(filename)
4078 return [] # silently skip if file is not present
4082 res += shlex.split(l, comments=True)
4087 def _format_option_string(option):
4088 ''' ('-o', '--option') -> -o, --format METAVAR'''
4092 if option._short_opts: opts.append(option._short_opts[0])
4093 if option._long_opts: opts.append(option._long_opts[0])
4094 if len(opts) > 1: opts.insert(1, ', ')
4096 if option.takes_value(): opts.append(' %s' % option.metavar)
4098 return "".join(opts)
4100 def _find_term_columns():
4101 columns = os.environ.get('COLUMNS', None)
4106 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4107 out,err = sp.communicate()
4108 return int(out.split()[1])
# --- interior of parseOpts(): build the optparse command-line parser ---
# (fragmentary excerpt; the enclosing `def parseOpts():` line is elided)
4114 max_help_position = 80
4116 # No need to wrap help messages if we're on a wide console
4117 columns = _find_term_columns()
4118 if columns: max_width = columns
4120 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Render option strings as "-x, --long METAVAR" in --help output.
4121 fmt.format_option_strings = _format_option_string
# Keyword arguments for the OptionParser constructor (dict literal is
# partially elided in this excerpt).
4124 'version' : __version__,
4126 'usage' : '%prog [options] url [url...]',
4127 'conflict_handler' : 'resolve',
4130 parser = optparse.OptionParser(**kw)
# One OptionGroup per --help section.
4133 general = optparse.OptionGroup(parser, 'General Options')
4134 selection = optparse.OptionGroup(parser, 'Video Selection')
4135 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4136 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4137 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4138 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4139 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4141 general.add_option('-h', '--help',
4142 action='help', help='print this help text and exit')
4143 general.add_option('-v', '--version',
4144 action='version', help='print program version and exit')
4145 general.add_option('-U', '--update',
4146 action='store_true', dest='update_self', help='update this program to latest version')
4147 general.add_option('-i', '--ignore-errors',
4148 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4149 general.add_option('-r', '--rate-limit',
4150 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4151 general.add_option('-R', '--retries',
4152 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4153 general.add_option('--dump-user-agent',
4154 action='store_true', dest='dump_user_agent',
4155 help='display the current browser identification', default=False)
4156 general.add_option('--list-extractors',
4157 action='store_true', dest='list_extractors',
4158 help='List all supported extractors and the URLs they would handle', default=False)
4160 selection.add_option('--playlist-start',
4161 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4162 selection.add_option('--playlist-end',
4163 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4164 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4165 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4166 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4168 authentication.add_option('-u', '--username',
4169 dest='username', metavar='USERNAME', help='account username')
4170 authentication.add_option('-p', '--password',
4171 dest='password', metavar='PASSWORD', help='account password')
4172 authentication.add_option('-n', '--netrc',
4173 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4176 video_format.add_option('-f', '--format',
4177 action='store', dest='format', metavar='FORMAT', help='video format code')
4178 video_format.add_option('--all-formats',
4179 action='store_const', dest='format', help='download all available video formats', const='all')
4180 video_format.add_option('--prefer-free-formats',
4181 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4182 video_format.add_option('--max-quality',
4183 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4184 video_format.add_option('-F', '--list-formats',
4185 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4188 verbosity.add_option('-q', '--quiet',
4189 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4190 verbosity.add_option('-s', '--simulate',
4191 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4192 verbosity.add_option('--skip-download',
4193 action='store_true', dest='skip_download', help='do not download the video', default=False)
4194 verbosity.add_option('-g', '--get-url',
4195 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4196 verbosity.add_option('-e', '--get-title',
4197 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4198 verbosity.add_option('--get-thumbnail',
4199 action='store_true', dest='getthumbnail',
4200 help='simulate, quiet but print thumbnail URL', default=False)
4201 verbosity.add_option('--get-description',
4202 action='store_true', dest='getdescription',
4203 help='simulate, quiet but print video description', default=False)
4204 verbosity.add_option('--get-filename',
4205 action='store_true', dest='getfilename',
4206 help='simulate, quiet but print output filename', default=False)
4207 verbosity.add_option('--get-format',
4208 action='store_true', dest='getformat',
4209 help='simulate, quiet but print output format', default=False)
4210 verbosity.add_option('--no-progress',
4211 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4212 verbosity.add_option('--console-title',
4213 action='store_true', dest='consoletitle',
4214 help='display progress in console titlebar', default=False)
4217 filesystem.add_option('-t', '--title',
4218 action='store_true', dest='usetitle', help='use title in file name', default=False)
4219 filesystem.add_option('-l', '--literal',
4220 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4221 filesystem.add_option('-A', '--auto-number',
4222 action='store_true', dest='autonumber',
4223 help='number downloaded files starting from 00000', default=False)
4224 filesystem.add_option('-o', '--output',
4225 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4226 filesystem.add_option('-a', '--batch-file',
4227 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4228 filesystem.add_option('-w', '--no-overwrites',
4229 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4230 filesystem.add_option('-c', '--continue',
4231 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
4232 filesystem.add_option('--no-continue',
4233 action='store_false', dest='continue_dl',
4234 help='do not resume partially downloaded files (restart from beginning)')
4235 filesystem.add_option('--cookies',
4236 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4237 filesystem.add_option('--no-part',
4238 action='store_true', dest='nopart', help='do not use .part files', default=False)
4239 filesystem.add_option('--no-mtime',
4240 action='store_false', dest='updatetime',
4241 help='do not use the Last-modified header to set the file modification time', default=True)
4242 filesystem.add_option('--write-description',
4243 action='store_true', dest='writedescription',
4244 help='write video description to a .description file', default=False)
4245 filesystem.add_option('--write-info-json',
4246 action='store_true', dest='writeinfojson',
4247 help='write video metadata to a .info.json file', default=False)
4250 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4251 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4252 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4253 help='"best", "aac", "vorbis", "mp3", or "m4a"; best by default')
4254 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4255 help='ffmpeg audio bitrate specification, 128k by default')
4256 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4257 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Register every group with the parser (order here controls --help order).
4260 parser.add_option_group(general)
4261 parser.add_option_group(selection)
4262 parser.add_option_group(filesystem)
4263 parser.add_option_group(verbosity)
4264 parser.add_option_group(video_format)
4265 parser.add_option_group(authentication)
4266 parser.add_option_group(postproc)
# Prepend options from /etc/youtube-dl.conf and the per-user config file
# (XDG_CONFIG_HOME, falling back to ~/.config) so the real command line
# can override them.
4268 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4270 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4272 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4273 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4274 opts, args = parser.parse_args(argv)
4276 return parser, opts, args
4278 def gen_extractors():
# NOTE(review): fragmentary excerpt -- the list literal's opening bracket,
# many extractor entries, and the final `return` are elided here.
4279 """ Return a list of an instance of every supported extractor.
4280 The order does matter; the first extractor matched is the one handling the URL.
# Base extractors built first; several derived IEs receive them below.
4282 youtube_ie = YoutubeIE()
4283 google_ie = GoogleIE()
4284 yahoo_ie = YahooIE()
# Playlist/user/search extractors are constructed with the matching base
# IE instance (presumably to delegate per-video extraction -- confirm).
4286 YoutubePlaylistIE(youtube_ie),
4287 YoutubeUserIE(youtube_ie),
4288 YoutubeSearchIE(youtube_ie),
4290 MetacafeIE(youtube_ie),
4293 GoogleSearchIE(google_ie),
4296 YahooSearchIE(yahoo_ie),
4309 StanfordOpenClassroomIE(),
# --- interior of _real_main(): wire options into a FileDownloader and run ---
# (fragmentary excerpt; the enclosing def line and several try/else lines
# are elided, so only the visible statements are annotated)
4315 parser, opts, args = parseOpts()
4317 # Open appropriate CookieJar
4318 if opts.cookiefile is None:
4319 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and preload it if readable.
4322 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4323 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4325 except (IOError, OSError), err:
4326 sys.exit(u'ERROR: unable to open cookie file')
4329 if opts.dump_user_agent:
4330 print std_headers['User-Agent']
4333 # Batch file verification
4335 if opts.batchfile is not None:
4337 if opts.batchfile == '-':
4340 batchfd = open(opts.batchfile, 'r')
4341 batchurls = batchfd.readlines()
4342 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
4343 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4345 sys.exit(u'ERROR: batch file could not be read')
4346 all_urls = batchurls + args
4348 # General configuration
4349 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4350 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4351 urllib2.install_opener(opener)
4352 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4354 extractors = gen_extractors()
# --list-extractors: print each IE with the URLs it would claim, then stop.
4356 if opts.list_extractors:
4357 for ie in extractors:
4359 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
# Remove claimed URLs so later extractors don't list them again.
4360 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4361 for mu in matchedUrls:
4365 # Conflicting, missing and erroneous options
4366 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4367 parser.error(u'using .netrc conflicts with giving username/password')
4368 if opts.password is not None and opts.username is None:
4369 parser.error(u'account username missing')
4370 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4371 parser.error(u'using output template conflicts with using title, literal title or auto number')
4372 if opts.usetitle and opts.useliteral:
4373 parser.error(u'using title conflicts with using literal title')
# Username given without password: prompt interactively.
4374 if opts.username is not None and opts.password is None:
4375 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string-valued options into their numeric forms.
4376 if opts.ratelimit is not None:
4377 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4378 if numeric_limit is None:
4379 parser.error(u'invalid rate limit specified')
4380 opts.ratelimit = numeric_limit
4381 if opts.retries is not None:
4383 opts.retries = long(opts.retries)
4384 except (TypeError, ValueError), err:
4385 parser.error(u'invalid retry count specified')
4387 opts.playliststart = int(opts.playliststart)
4388 if opts.playliststart <= 0:
4389 raise ValueError(u'Playlist start must be positive')
4390 except (TypeError, ValueError), err:
4391 parser.error(u'invalid playlist start number specified')
4393 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel for "until the end of the playlist".
4394 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4395 raise ValueError(u'Playlist end must be greater than playlist start')
4396 except (TypeError, ValueError), err:
4397 parser.error(u'invalid playlist end number specified')
4398 if opts.extractaudio:
4399 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a']:
4400 parser.error(u'invalid audio format specified')
# Translate parsed options into the FileDownloader parameter dict.
4403 fd = FileDownloader({
4404 'usenetrc': opts.usenetrc,
4405 'username': opts.username,
4406 'password': opts.password,
# Any of the --get-* flags implies quiet, simulate-style operation.
4407 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4408 'forceurl': opts.geturl,
4409 'forcetitle': opts.gettitle,
4410 'forcethumbnail': opts.getthumbnail,
4411 'forcedescription': opts.getdescription,
4412 'forcefilename': opts.getfilename,
4413 'forceformat': opts.getformat,
4414 'simulate': opts.simulate,
4415 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4416 'format': opts.format,
4417 'format_limit': opts.format_limit,
4418 'listformats': opts.listformats,
# Output template: explicit -o wins; otherwise pick the first template
# matching the combination of -f -1 / --title / --literal / -A flags.
4419 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4420 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4421 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4422 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4423 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4424 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4425 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4426 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4427 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4428 or u'%(id)s.%(ext)s'),
4429 'ignoreerrors': opts.ignoreerrors,
4430 'ratelimit': opts.ratelimit,
4431 'nooverwrites': opts.nooverwrites,
4432 'retries': opts.retries,
4433 'continuedl': opts.continue_dl,
4434 'noprogress': opts.noprogress,
4435 'playliststart': opts.playliststart,
4436 'playlistend': opts.playlistend,
# Writing the video to stdout means progress must go to stderr.
4437 'logtostderr': opts.outtmpl == '-',
4438 'consoletitle': opts.consoletitle,
4439 'nopart': opts.nopart,
4440 'updatetime': opts.updatetime,
4441 'writedescription': opts.writedescription,
4442 'writeinfojson': opts.writeinfojson,
4443 'matchtitle': opts.matchtitle,
4444 'rejecttitle': opts.rejecttitle,
4445 'max_downloads': opts.max_downloads,
4446 'prefer_free_formats': opts.prefer_free_formats,
4448 for extractor in extractors:
4449 fd.add_info_extractor(extractor)
# Post-processors (ffmpeg audio extraction) run after each download.
4452 if opts.extractaudio:
4453 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4456 if opts.update_self:
4457 updateSelf(fd, sys.argv[0])
4460 if len(all_urls) < 1:
# -U with no URLs is a valid invocation; anything else needs a URL.
4461 if not opts.update_self:
4462 parser.error(u'you must provide at least one URL')
4467 retcode = fd.download(all_urls)
4468 except MaxDownloadsReached:
4469 fd.to_screen(u'--max-download limit reached, aborting.')
4472 # Dump cookie jar if requested
4473 if opts.cookiefile is not None:
4476 except (IOError, OSError), err:
4477 sys.exit(u'ERROR: unable to save cookie jar')
4484 except DownloadError:
4486 except SameFileError:
4487 sys.exit(u'ERROR: fixed output name but more than one file to download')
4488 except KeyboardInterrupt:
4489 sys.exit(u'\nERROR: Interrupted by user')
# Script entry point; the guarded call into main falls in an elided span.
4491 if __name__ == '__main__':
4494 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: