2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # License: Public domain code
12 from __future__ import with_statement
41 warnings.warn('No JSON support (TODO: insert trivialjson here)')
44 import cStringIO as StringIO
48 # parse_qs was moved from the cgi module to the urlparse module recently.
50 from urlparse import parse_qs
52 from cgi import parse_qs
56 except ImportError: # Python < 2.6
60 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
61 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
62 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
63 'Accept-Encoding': 'gzip, deflate',
64 'Accept-Language': 'en-us,en;q=0.5',
# Unicode alphanumerics considered "safe" when building simplified titles.
# (str.decode('ascii') on a str literal implies this file targets Python 2.)
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The previous implementation routed this through an infinite inner
    # generator and the Python-2-only .next() call; a direct try/except is
    # equivalent, simpler, and works on both Python 2 and 3.
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported name is a real codec before trusting it;
        # otherwise fall back to UTF-8.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, e.g. "#38" (decimal) or "#x26" (hex).
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the `mobj is not None` guard and the lines selecting the
    # numeric base appear to be elided from this excerpt — verify in full file.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with '0' so the base conversion accepts the '0x...' form.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
111 def sanitize_title(utitle):
112 """Sanitizes a video title so it could be used as part of a filename."""
113 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
114 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the `try:` pairing with the `except` below (and likely a
    # stdout special case / `import msvcrt`) appear to be elided from this
    # excerpt — verify against the full file.
    if sys.platform == 'win32':
        # Put stdout into binary mode so binary video data is not mangled
        # by newline translation on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as a number, or None when `timestr`
    cannot be parsed as an RFC 2822 date.
    """
    # As excerpted, the computed value was never initialized or returned;
    # make the None fallback and the return explicit.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Both arguments are byte counts, kept so the caller can build a
    # meaningful error message.
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded  # bytes actually received
        self.expected = expected      # bytes announced by the server
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): the `def deflate(data):` header and its try: line enclosing
    # the two returns below appear to be elided from this excerpt.
        # Raw deflate stream (no zlib header).
        return zlib.decompress(data, -zlib.MAX_WBITS)
        # Fallback: a standard zlib-wrapped stream.
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Emulate the 4-argument addinfourl constructor when the running
        # urllib2 lacks it (detected via the presence of getcode).
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # NOTE(review): `ret.code = code; return ret` appear elided here.

    def http_request(self, req):
        # Force our standard headers onto every outgoing request.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Honour the internal opt-out header, then strip it so it is never
        # sent over the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate response bodies.
        # NOTE(review): the `old_resp = resp` assignment is elided from this
        # excerpt but is required by the references below.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    writedescription: Write the video description to a .description file
    """

    # NOTE(review): this excerpt elides many physical lines of the original
    # class (decorators such as @staticmethod, try:/else: branches, return
    # statements, loop headers). Comments below document only the visible
    # code; dangling statements are marked where the elision is apparent.

    _download_retcode = None   # process exit code accumulated across downloads
    _num_downloads = None      # ordinal used by the %(autonumber)s template key

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen output to stderr when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build every ancestor path, shortest first, each ending in os.sep.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):  # (mkdir call elided in excerpt)

    def format_bytes(bytes):
        """Format a byte count as a short human-readable string, e.g. '1.00M'."""
        if type(bytes) is str:  # (float conversion elided in excerpt)
        # Pick the power-of-1024 exponent and matching suffix.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        """Return the download percentage as a right-aligned 6-char string."""
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'."""
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        """Return the download speed as a right-aligned 10-char string."""
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's throughput."""
        # Clamp the next block between half and double the previous one.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:  # (early return elided in excerpt)
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # Suffix position in 'bkmgtpezy' doubles as the power of 1024.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE learns who its downloader is.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:  # (paired try: elided in excerpt)
            if not ignore_encoding_errors:  # (re-raise elided in excerpt)

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):  # (return elided)
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm escape sequence to set the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # Fixed means it contains no %(...)s substitution fields.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:  # (return elided)
        elapsed = now - start_time  # NOTE(review): `now` assignment elided
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to fall back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # stdout, nopart mode, and non-regular existing files get no .part.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename + u'.part'

    def undo_temp_name(self, filename):
        # Strip a trailing .part suffix, if present.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        # Move the finished .part file into its final place.
        if old_filename == new_filename:  # (return elided)
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: unable to rename file')

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:  # (return elided)
        if not os.path.isfile(filename):  # (return elided)
        timestr = last_modified_hdr
        filetime = timeconvert(timestr)
        # Keep atime current; set mtime from the server header.
        os.utime(filename,(time.time(), filetime))

    def report_writedescription(self, descfn):
        """ Report that the description file has been written """
        self.to_screen(u'[info] Video description written to: %s' % descfn, ignore_encoding_errors=True)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):  # (return elided)
        # \r rewinds so successive updates overwrite the same line.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:  # (paired try: elided in excerpt)
            # Fall back to a message without the (unencodable) filename.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        template_dict = dict(info_dict)
        # Extra template keys available beyond what the IE provided.
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: invalid system charset or erroneous output template')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
        # Forced printings for the various --get-* options.
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
        # Create parent directories before opening the output file.
        self.pmkdir(filename)
        except (OSError, IOError), err:  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
        if self.params.get('writedescription', False):
            descfn = filename + '.description'
            with contextlib.closing(open(descfn, 'wb')) as descfile:
                descfile.write(info_dict['description'].encode('utf-8'))
            self.report_writedescription(descfn)
            except (OSError, IOError):  # (paired try: elided in excerpt)
                self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:  # (paired try: elided in excerpt)
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only receive one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):  # NOTE(review): enclosing loops elided
        # Suitable InfoExtractor found
        suitable_found = True
        # Extract information from URL and process it
        # Suitable InfoExtractor had been found; go to next URL
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info['filepath'] = filename  # NOTE(review): `info` init and PP loop elided

    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump binary."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            # No progress since last resume attempt: stop retrying.
            if prevsize == cursize and retval == 1:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        """Download `url` to `filename` over HTTP, with resume and retries."""
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request carries no Range header; used to probe full length.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:  # (paired try: elided in excerpt)
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:  # (paired try: elided)
                        if err.code < 500 or err.code >= 600:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
            self.report_retry(count, retries)
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Content-Length covers only the remaining range; add what we have.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        data_block = data.read(block_size)  # NOTE(review): read loop header elided
        if len(data_block) == 0:  # (break elided)
        byte_counter += len(data_block)

        # Open file just in time
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        # sanitize_open may have altered the name; report the final one.
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:  # (paired try: elided in excerpt)
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        stream.write(data_block)
        except (IOError, OSError), err:  # (paired try: elided in excerpt)
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt block size to the measured throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit, if any.
        self.slow_down(start, byte_counter - resume_len)

        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    # NOTE(review): the `def suitable(url):` header (and likely a
    # @staticmethod decorator) for the docstring below are elided here.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt shows only the head of the class; several
    # physical lines (dict entries, loop/try headers, returns) are elided.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): remaining dict entries, the closing brace, and the
    # `def suitable(url):` header for the return below are elided.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set language, log in (from params or .netrc), and confirm age."""
        if self._downloader is None:  # (return elided)
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # (branch extracting username/password from `info` elided)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:  # (paired try: elided)
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English site so later regex scraping is stable.
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:  # (paired try: elided)
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # (login_form dict opening elided)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form re-appearing in the response means login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:  # (paired try: elided)
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age before age-restricted content is served.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:  # (paired try: elided)
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# YoutubeIE._real_extract: the main extraction routine for a single YouTube
# watch URL.  Visible flow: parse the video id out of the URL, download the
# watch page, try to locate the SWF player URL, fetch get_video_info (cycling
# several '&el=' variants until a 'token' appears), pull uploader/title/
# thumbnail/upload-date/description out of the query-string dict, choose the
# download format(s) from 'fmt_url_map' (or an rtmp 'conn' URL), and hand
# each (format, url) pair to self._downloader.process_info().
# NOTE(review): this chunk is an elided dump -- interior lines (try:,
# if mobj is None:, else:, return) are missing from view; the code below is
# reproduced byte-for-byte, comments only added.
1018 def _real_extract(self, url):
1019 # Extract video id from URL
1020 mobj = re.match(self._VALID_URL, url)
# error path for a non-matching URL (the guarding 'if' line is elided)
1022 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1024 video_id = mobj.group(2)
# Download the watch page (gl/hl pinned to US English; has_verified bypasses
# some interstitials -- presumably the age gate, TODO confirm)
1027 self.report_video_webpage_download(video_id)
1028 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1030 video_webpage = urllib2.urlopen(request).read()
1031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1032 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1035 # Attempt to extract SWF player URL
1036 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1037 if mobj is not None:
# un-escape JavaScript backslash escapes in the matched URL
1038 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several get_video_info 'el' variants until one yields a 'token'
1043 self.report_video_info_webpage_download(video_id)
1044 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1045 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1046 % (video_id, el_type))
1047 request = urllib2.Request(video_info_url)
1049 video_info_webpage = urllib2.urlopen(request).read()
# the response is an application/x-www-form-urlencoded blob
1050 video_info = parse_qs(video_info_webpage)
1051 if 'token' in video_info:
1053 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1054 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1056 if 'token' not in video_info:
1057 if 'reason' in video_info:
# YouTube supplies a human-readable refusal reason when available
1058 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1060 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1063 # Start extracting information
1064 self.report_information_extraction(video_id)
# uploader
1067 if 'author' not in video_info:
1068 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1070 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
1073 if 'title' not in video_info:
1074 self._downloader.trouble(u'ERROR: unable to extract video title')
1076 video_title = urllib.unquote_plus(video_info['title'][0])
1077 video_title = video_title.decode('utf-8')
1078 video_title = sanitize_title(video_title)
# simple_title: filename-safe variant (non-alphanumeric runs -> '_')
1081 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1082 simple_title = simple_title.strip(ur'_')
# thumbnail -- optional, warning only
1085 if 'thumbnail_url' not in video_info:
1086 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1087 video_thumbnail = ''
1088 else: # don't panic if we can't find it
1089 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalised to YYYYMMDD,
# trying several textual date layouts
1093 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1094 if mobj is not None:
1095 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1096 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1097 for expression in format_expressions:
1099 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: regex fallback when lxml is unavailable, lxml XPath otherwise
1107 video_description = u'No description available.'
1108 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1109 warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
1110 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1111 if mobj is not None:
1112 video_description = mobj.group(1).decode('utf-8')
1114 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1115 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1116 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# token (presence already verified above)
1119 video_token = urllib.unquote_plus(video_info['token'][0])
1121 # Decide which formats to download
1122 req_format = self._downloader.params.get('format', None)
1124 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
# fmt_url_map is 'fmt|url,fmt|url,...' -> {fmt: url}
1125 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1126 format_limit = self._downloader.params.get('format_limit', None)
1127 if format_limit is not None and format_limit in self._available_formats:
# restrict to formats at or below the requested quality cap
1128 format_list = self._available_formats[self._available_formats.index(format_limit):]
1130 format_list = self._available_formats
1131 existing_formats = [x for x in format_list if x in url_map]
1132 if len(existing_formats) == 0:
1133 self._downloader.trouble(u'ERROR: no known formats available for video')
1135 if req_format is None:
1136 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1137 elif req_format == '-1':
1138 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1141 if req_format not in url_map:
1142 self._downloader.trouble(u'ERROR: requested format not available')
1144 video_url_list = [(req_format, url_map[req_format])] # Specific format
1146 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
# rtmp streams carry the URL in 'conn'; no format selection applies
1147 self.report_rtmp_download()
1148 video_url_list = [(None, video_info['conn'][0])]
1151 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1154 for format_param, video_real_url in video_url_list:
1155 # At this point we have a new video
1156 self._downloader.increment_downloads()
# Extension: look it up by format code, default to flv
1159 video_extension = self._video_extensions.get(format_param, 'flv')
1161 # Find the video URL in fmt_url_map or conn paramters
1163 # Process video information
1164 self._downloader.process_info({
1165 'id': video_id.decode('utf-8'),
1166 'url': video_real_url.decode('utf-8'),
1167 'uploader': video_uploader.decode('utf-8'),
1168 'upload_date': upload_date,
1169 'title': video_title,
1170 'stitle': simple_title,
# Python 2 and-or ternary idiom: u'NA' when no explicit format
1171 'ext': video_extension.decode('utf-8'),
1172 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1173 'thumbnail': video_thumbnail.decode('utf-8'),
1174 'description': video_description,
1175 'player_url': player_url,
1177 except UnavailableVideoError, err:
1178 self._downloader.trouble(u'\nERROR: unable to download video')
# MetacafeIE: extractor for metacafe.com watch pages.  On initialisation it
# fetches the family-filter disclaimer page and POSTs an age confirmation so
# later requests are not filtered.  'yt-' prefixed ids are delegated to the
# wrapped YouTube extractor.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return) are missing from view; code reproduced byte-for-byte.
1181 class MetacafeIE(InfoExtractor):
1182 """Information Extractor for metacafe.com."""
1184 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1185 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1186 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# youtube_ie: a YoutubeIE instance used for yt-* delegation
1189 def __init__(self, youtube_ie, downloader=None):
1190 InfoExtractor.__init__(self, downloader)
1191 self._youtube_ie = youtube_ie
# (suitable-URL predicate; its 'def' line is elided from this view)
1195 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1197 def report_disclaimer(self):
1198 """Report disclaimer retrieval."""
1199 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1201 def report_age_confirmation(self):
1202 """Report attempt to confirm age."""
1203 self._downloader.to_screen(u'[metacafe] Confirming age')
1205 def report_download_webpage(self, video_id):
1206 """Report webpage download."""
1207 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1209 def report_extraction(self, video_id):
1210 """Report information extraction."""
1211 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1213 def _real_initialize(self):
1214 # Retrieve disclaimer
1215 request = urllib2.Request(self._DISCLAIMER)
1217 self.report_disclaimer()
1218 disclaimer = urllib2.urlopen(request).read()
1219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age: POST the family-filter form (form dict partially elided)
1226 'submit': "Continue - I'm over 18",
1228 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1230 self.report_age_confirmation()
1231 disclaimer = urllib2.urlopen(request).read()
1232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1236 def _real_extract(self, url):
1237 # Extract id and simplified title from URL
1238 mobj = re.match(self._VALID_URL, url)
1240 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1243 video_id = mobj.group(1)
1245 # Check if video comes from YouTube
1246 mobj2 = re.match(r'^yt-(.*)$', video_id)
1247 if mobj2 is not None:
# delegate to the wrapped YouTube extractor (return is elided)
1248 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1251 # At this point we have a new video
1252 self._downloader.increment_downloads()
1254 simple_title = mobj.group(2).decode('utf-8')
1256 # Retrieve video webpage to extract further information
1257 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1259 self.report_download_webpage(video_id)
1260 webpage = urllib2.urlopen(request).read()
1261 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message reads 'unable retrieve' -- missing 'to' (runtime
# string; left untouched in this documentation pass)
1262 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1265 # Extract URL, uploader and title from webpage
1266 self.report_extraction(video_id)
# Primary path: direct &mediaURL= parameter, optionally signed with gdaKey
1267 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1268 if mobj is not None:
1269 mediaURL = urllib.unquote(mobj.group(1))
# crude extension guess: last three characters of the URL
1270 video_extension = mediaURL[-3:]
1272 # Extract gdaKey if available
1273 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1275 video_url = mediaURL
1277 gdaKey = mobj.group(1)
1278 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData JSON
1280 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1282 self._downloader.trouble(u'ERROR: unable to extract media URL')
1284 vardict = parse_qs(mobj.group(1))
1285 if 'mediaData' not in vardict:
1286 self._downloader.trouble(u'ERROR: unable to extract media URL')
1288 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1290 self._downloader.trouble(u'ERROR: unable to extract media URL')
# un-escape JSON '\/' sequences
1292 mediaURL = mobj.group(1).replace('\\/', '/')
1293 video_extension = mediaURL[-3:]
1294 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1296 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1298 self._downloader.trouble(u'ERROR: unable to extract title')
1300 video_title = mobj.group(1).decode('utf-8')
1301 video_title = sanitize_title(video_title)
1303 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1305 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1307 video_uploader = mobj.group(1)
1310 # Process video information
1311 self._downloader.process_info({
1312 'id': video_id.decode('utf-8'),
1313 'url': video_url.decode('utf-8'),
1314 'uploader': video_uploader.decode('utf-8'),
1315 'upload_date': u'NA',
1316 'title': video_title,
1317 'stitle': simple_title,
1318 'ext': video_extension.decode('utf-8'),
1322 except UnavailableVideoError:
1323 self._downloader.trouble(u'\nERROR: unable to download video')
# DailymotionIE: extractor for dailymotion.* video pages.  Downloads the
# page, pulls the media URL out of an addVariable("video", ...) call, and
# scrapes title and owner from the markup.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return, _real_initialize body) are missing from view; code reproduced
# byte-for-byte.
1326 class DailymotionIE(InfoExtractor):
1327 """Information Extractor for Dailymotion"""
# group(1) = video id, group(2) = slug used as the simplified title
1329 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1331 def __init__(self, downloader=None):
1332 InfoExtractor.__init__(self, downloader)
# (suitable-URL predicate; its 'def' line is elided from this view)
1336 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1338 def report_download_webpage(self, video_id):
1339 """Report webpage download."""
1340 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1342 def report_extraction(self, video_id):
1343 """Report information extraction."""
1344 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# no initialisation needed for this site (body elided)
1346 def _real_initialize(self):
1349 def _real_extract(self, url):
1350 # Extract id and simplified title from URL
1351 mobj = re.match(self._VALID_URL, url)
1353 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1356 # At this point we have a new video
1357 self._downloader.increment_downloads()
1358 video_id = mobj.group(1)
1360 simple_title = mobj.group(2).decode('utf-8')
1361 video_extension = 'flv'
1363 # Retrieve video webpage to extract further information
1364 request = urllib2.Request(url)
1366 self.report_download_webpage(video_id)
1367 webpage = urllib2.urlopen(request).read()
1368 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1372 # Extract URL, uploader and title from webpage
1373 self.report_extraction(video_id)
# media URL is set via the flash player's addVariable("video", ...)
1374 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1376 self._downloader.trouble(u'ERROR: unable to extract media URL')
1378 mediaURL = urllib.unquote(mobj.group(1))
1380 # if needed add http://www.dailymotion.com/ if relative URL
1382 video_url = mediaURL
1384 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1385 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1387 self._downloader.trouble(u'ERROR: unable to extract title')
1389 video_title = mobj.group(1).decode('utf-8')
1390 video_title = sanitize_title(video_title)
1392 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1394 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1396 video_uploader = mobj.group(1)
1399 # Process video information
1400 self._downloader.process_info({
1401 'id': video_id.decode('utf-8'),
1402 'url': video_url.decode('utf-8'),
1403 'uploader': video_uploader.decode('utf-8'),
1404 'upload_date': u'NA',
1405 'title': video_title,
1406 'stitle': simple_title,
1407 'ext': video_extension.decode('utf-8'),
1411 except UnavailableVideoError:
1412 self._downloader.trouble(u'\nERROR: unable to download video')
# GoogleIE: extractor for video.google.com videoplay pages.  Prefers the
# mp4 download_url embedded in the page; falls back to the escaped flv
# videoUrl.  Thumbnail is only fetched when 'forcethumbnail' is set, via a
# secondary search-page request.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return, else:) are missing from view; code reproduced byte-for-byte.
1414 class GoogleIE(InfoExtractor):
1415 """Information extractor for video.google.com."""
1417 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1419 def __init__(self, downloader=None):
1420 InfoExtractor.__init__(self, downloader)
# (suitable-URL predicate; its 'def' line is elided from this view)
1424 return (re.match(GoogleIE._VALID_URL, url) is not None)
1426 def report_download_webpage(self, video_id):
1427 """Report webpage download."""
1428 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1430 def report_extraction(self, video_id):
1431 """Report information extraction."""
1432 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# no initialisation needed for this site (body elided)
1434 def _real_initialize(self):
1437 def _real_extract(self, url):
1438 # Extract id from URL
1439 mobj = re.match(self._VALID_URL, url)
1441 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1444 # At this point we have a new video
1445 self._downloader.increment_downloads()
1446 video_id = mobj.group(1)
1448 video_extension = 'mp4'
1450 # Retrieve video webpage to extract further information
1451 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1453 self.report_download_webpage(video_id)
1454 webpage = urllib2.urlopen(request).read()
1455 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1459 # Extract URL, uploader, and title from webpage
1460 self.report_extraction(video_id)
# mp4 path: download_url:'...' present in the page source
1461 mobj = re.search(r"download_url:'([^']+)'", webpage)
# flv fallback: videoUrl with \x3d / \x26 JavaScript hex escapes
1463 video_extension = 'flv'
1464 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1466 self._downloader.trouble(u'ERROR: unable to extract media URL')
1468 mediaURL = urllib.unquote(mobj.group(1))
# decode the literal backslash escapes into '=' and '&'
1469 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1470 mediaURL = mediaURL.replace('\\x26', '\x26')
1472 video_url = mediaURL
1474 mobj = re.search(r'<title>(.*)</title>', webpage)
1476 self._downloader.trouble(u'ERROR: unable to extract title')
1478 video_title = mobj.group(1).decode('utf-8')
1479 video_title = sanitize_title(video_title)
1480 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1482 # Extract video description
1483 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1485 self._downloader.trouble(u'ERROR: unable to extract video description')
1487 video_description = mobj.group(1).decode('utf-8')
1488 if not video_description:
1489 video_description = 'No description available.'
1491 # Extract video thumbnail
1492 if self._downloader.params.get('forcethumbnail', False):
# docid can be negative; the search page expects the absolute value
1493 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1495 webpage = urllib2.urlopen(request).read()
1496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1497 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1499 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1501 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1503 video_thumbnail = mobj.group(1)
1504 else: # we need something to pass to process_info
1505 video_thumbnail = ''
1509 # Process video information
1510 self._downloader.process_info({
1511 'id': video_id.decode('utf-8'),
1512 'url': video_url.decode('utf-8'),
1514 'upload_date': u'NA',
1515 'title': video_title,
1516 'stitle': simple_title,
1517 'ext': video_extension.decode('utf-8'),
1521 except UnavailableVideoError:
1522 self._downloader.trouble(u'\nERROR: unable to download video')
# PhotobucketIE: extractor for photobucket.com flv links.  Both the media
# URL and the uploader come from the same page: the URL from the
# rel="video_src" link, the title and uploader from the <title> tag.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return) are missing from view; code reproduced byte-for-byte.
1525 class PhotobucketIE(InfoExtractor):
1526 """Information extractor for photobucket.com."""
# the ?current= query parameter carries the .flv name (group 1)
1528 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1530 def __init__(self, downloader=None):
1531 InfoExtractor.__init__(self, downloader)
# (suitable-URL predicate; its 'def' line is elided from this view)
1535 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1537 def report_download_webpage(self, video_id):
1538 """Report webpage download."""
1539 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1541 def report_extraction(self, video_id):
1542 """Report information extraction."""
1543 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# no initialisation needed for this site (body elided)
1545 def _real_initialize(self):
1548 def _real_extract(self, url):
1549 # Extract id from URL
1550 mobj = re.match(self._VALID_URL, url)
1552 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1555 # At this point we have a new video
1556 self._downloader.increment_downloads()
1557 video_id = mobj.group(1)
1559 video_extension = 'flv'
1561 # Retrieve video webpage to extract further information
1562 request = urllib2.Request(url)
1564 self.report_download_webpage(video_id)
1565 webpage = urllib2.urlopen(request).read()
1566 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1567 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1570 # Extract URL, uploader, and title from webpage
1571 self.report_extraction(video_id)
1572 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1574 self._downloader.trouble(u'ERROR: unable to extract media URL')
1576 mediaURL = urllib.unquote(mobj.group(1))
1578 video_url = mediaURL
# <title> carries both the video title (group 1) and uploader (group 2)
1580 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1582 self._downloader.trouble(u'ERROR: unable to extract title')
1584 video_title = mobj.group(1).decode('utf-8')
1585 video_title = sanitize_title(video_title)
1586 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1588 video_uploader = mobj.group(2).decode('utf-8')
1591 # Process video information
1592 self._downloader.process_info({
1593 'id': video_id.decode('utf-8'),
1594 'url': video_url.decode('utf-8'),
1595 'uploader': video_uploader,
1596 'upload_date': u'NA',
1597 'title': video_title,
1598 'stitle': simple_title,
1599 'ext': video_extension.decode('utf-8'),
1603 except UnavailableVideoError:
1604 self._downloader.trouble(u'\nERROR: unable to download video')
# YahooIE: extractor for video.yahoo.com.  Non-/watch/ URLs are first
# rewritten: the page is fetched once to recover the canonical id/vid pair,
# then _real_extract recurses on the canonical /watch/ URL.  The media URL
# itself comes from a playlist XML fetched from cosmos.bcst.yahoo.com.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return) are missing from view; code reproduced byte-for-byte.
1607 class YahooIE(InfoExtractor):
1608 """Information extractor for video.yahoo.com."""
1610 # _VALID_URL matches all Yahoo! Video URLs
1611 # _VPAGE_URL matches only the extractable '/watch/' URLs
1612 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1613 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1615 def __init__(self, downloader=None):
1616 InfoExtractor.__init__(self, downloader)
# (suitable-URL predicate; its 'def' line is elided from this view)
1620 return (re.match(YahooIE._VALID_URL, url) is not None)
1622 def report_download_webpage(self, video_id):
1623 """Report webpage download."""
1624 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1626 def report_extraction(self, video_id):
1627 """Report information extraction."""
1628 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# no initialisation needed for this site (body elided)
1630 def _real_initialize(self):
# new_video=False marks the recursive call after URL canonicalisation
1633 def _real_extract(self, url, new_video=True):
1634 # Extract ID from URL
1635 mobj = re.match(self._VALID_URL, url)
1637 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1640 # At this point we have a new video
1641 self._downloader.increment_downloads()
1642 video_id = mobj.group(2)
1643 video_extension = 'flv'
1645 # Rewrite valid but non-extractable URLs as
1646 # extractable English language /watch/ URLs
1647 if re.match(self._VPAGE_URL, url) is None:
1648 request = urllib2.Request(url)
1650 webpage = urllib2.urlopen(request).read()
1651 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1652 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1655 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1657 self._downloader.trouble(u'ERROR: Unable to extract id field')
1659 yahoo_id = mobj.group(1)
1661 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1663 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1665 yahoo_vid = mobj.group(1)
# recurse once on the canonical /watch/ URL
1667 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1668 return self._real_extract(url, new_video=False)
1670 # Retrieve video webpage to extract further information
1671 request = urllib2.Request(url)
1673 self.report_download_webpage(video_id)
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1679 # Extract uploader and title from webpage
1680 self.report_extraction(video_id)
1681 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1683 self._downloader.trouble(u'ERROR: unable to extract video title')
1685 video_title = mobj.group(1).decode('utf-8')
1686 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1688 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1690 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) captures the (people|profile) alternation, not the
# uploader name, which is group(2) -- looks like a bug; verify upstream fix
1692 video_uploader = mobj.group(1).decode('utf-8')
1694 # Extract video thumbnail
1695 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1697 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1699 video_thumbnail = mobj.group(1).decode('utf-8')
1701 # Extract video description
1702 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1704 self._downloader.trouble(u'ERROR: unable to extract video description')
1706 video_description = mobj.group(1).decode('utf-8')
1707 if not video_description: video_description = 'No description available.'
1709 # Extract video height and width
1710 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1712 self._downloader.trouble(u'ERROR: unable to extract video height')
1714 yv_video_height = mobj.group(1)
1716 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1718 self._downloader.trouble(u'ERROR: unable to extract video width')
1720 yv_video_width = mobj.group(1)
1722 # Retrieve video playlist to extract media URL
1723 # I'm not completely sure what all these options are, but we
1724 # seem to need most of them, otherwise the server sends a 401.
1725 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1726 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1727 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1728 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1729 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1731 self.report_download_webpage(video_id)
1732 webpage = urllib2.urlopen(request).read()
1733 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1734 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1737 # Extract media URL from playlist XML
1738 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1740 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1742 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1743 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1746 # Process video information
1747 self._downloader.process_info({
1748 'id': video_id.decode('utf-8'),
1750 'uploader': video_uploader,
1751 'upload_date': u'NA',
1752 'title': video_title,
1753 'stitle': simple_title,
1754 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; in a dict literal the later entries (the undecoded thumbnail
# below) silently win -- the duplicate pair should be removed upstream
1755 'thumbnail': video_thumbnail.decode('utf-8'),
1756 'description': video_description,
1757 'thumbnail': video_thumbnail,
1758 'description': video_description,
1761 except UnavailableVideoError:
1762 self._downloader.trouble(u'\nERROR: unable to download video')
# GenericIE: last-resort extractor for arbitrary pages.  Looks for a JW
# Player / SWFObject 'file=' flashvar, falling back to a broader
# 'file='/'source=' search; title comes from <title>, uploader from the
# URL's host name.
# NOTE(review): elided dump -- interior lines (try:, if mobj is None:,
# return) are missing from view; code reproduced byte-for-byte.
1765 class GenericIE(InfoExtractor):
1766 """Generic last-resort information extractor."""
1768 def __init__(self, downloader=None):
1769 InfoExtractor.__init__(self, downloader)
1775 def report_download_webpage(self, video_id):
1776 """Report webpage download."""
# warn loudly: the generic extractor is a heuristic of last resort
1777 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1778 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1780 def report_extraction(self, video_id):
1781 """Report information extraction."""
1782 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# no initialisation needed (body elided)
1784 def _real_initialize(self):
1787 def _real_extract(self, url):
1788 # At this point we have a new video
1789 self._downloader.increment_downloads()
# provisional id: last path component of the URL (refined below)
1791 video_id = url.split('/')[-1]
1792 request = urllib2.Request(url)
1794 self.report_download_webpage(video_id)
1795 webpage = urllib2.urlopen(request).read()
1796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1799 except ValueError, err:
1800 # since this is the last-resort InfoExtractor, if
1801 # this error is thrown, it'll be thrown here
1802 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1805 self.report_extraction(video_id)
1806 # Start with something easy: JW Player in SWFObject
1807 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1809 # Broaden the search a little bit
1810 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1812 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1815 # It's possible that one of the regexes
1816 # matched, but returned an empty group:
1817 if mobj.group(1) is None:
1818 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1821 video_url = urllib.unquote(mobj.group(1))
1822 video_id = os.path.basename(video_url)
1824 # here's a fun little line of code for you:
# derive extension and strip it from the id
1825 video_extension = os.path.splitext(video_id)[1][1:]
1826 video_id = os.path.splitext(video_id)[0]
1828 # it's tempting to parse this further, but you would
1829 # have to take into account all the variations like
1830 # Video Title - Site Name
1831 # Site Name | Video Title
1832 # Video Title - Tagline | Site Name
1833 # and so on and so forth; it's just not practical
1834 mobj = re.search(r'<title>(.*)</title>', webpage)
1836 self._downloader.trouble(u'ERROR: unable to extract title')
1838 video_title = mobj.group(1).decode('utf-8')
1839 video_title = sanitize_title(video_title)
1840 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1842 # video uploader is domain name
1843 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1845 self._downloader.trouble(u'ERROR: unable to extract title')
1847 video_uploader = mobj.group(1).decode('utf-8')
1850 # Process video information
1851 self._downloader.process_info({
1852 'id': video_id.decode('utf-8'),
1853 'url': video_url.decode('utf-8'),
1854 'uploader': video_uploader,
1855 'upload_date': u'NA',
1856 'title': video_title,
1857 'stitle': simple_title,
1858 'ext': video_extension.decode('utf-8'),
1862 except UnavailableVideoError, err:
1863 self._downloader.trouble(u'\nERROR: unable to download video')
# YoutubeSearchIE: handles 'ytsearchN:query' / 'ytsearchall:query' pseudo-
# URLs.  Pages through YouTube search results, collecting video ids until N
# are found (or no more pages), then delegates each to the wrapped YoutubeIE.
# NOTE(review): elided dump -- interior lines (try:, return, loop headers)
# are missing from view; code reproduced byte-for-byte.
1866 class YoutubeSearchIE(InfoExtractor):
1867 """Information Extractor for YouTube search queries."""
1868 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1869 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1870 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1871 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# hard cap imposed by YouTube search
1873 _max_youtube_results = 1000
1875 def __init__(self, youtube_ie, downloader=None):
1876 InfoExtractor.__init__(self, downloader)
1877 self._youtube_ie = youtube_ie
# (suitable-URL predicate; its 'def' line is elided from this view)
1881 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1883 def report_download_page(self, query, pagenum):
1884 """Report attempt to download playlist page with given number."""
1885 query = query.decode(preferredencoding())
1886 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1888 def _real_initialize(self):
1889 self._youtube_ie.initialize()
1891 def _real_extract(self, query):
1892 mobj = re.match(self._VALID_QUERY, query)
1894 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# split 'ytsearchN' prefix from the query text
1897 prefix, query = query.split(':')
1899 query = query.encode('utf-8')
# empty prefix -> default to one result
1901 self._download_n_results(query, 1)
1903 elif prefix == 'all':
1904 self._download_n_results(query, self._max_youtube_results)
# numeric prefix: validate and clamp to the maximum
1910 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1912 elif n > self._max_youtube_results:
1913 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1914 n = self._max_youtube_results
1915 self._download_n_results(query, n)
1917 except ValueError: # parsing prefix as integer fails
1918 self._download_n_results(query, 1)
1921 def _download_n_results(self, query, n):
1922 """Downloads a specified number of results for a query"""
1925 already_seen = set()
# per-page loop (loop header elided from this view)
1929 self.report_download_page(query, pagenum)
1930 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1931 request = urllib2.Request(result_url)
1933 page = urllib2.urlopen(request).read()
1934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1938 # Extract video identifiers
1939 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# slice the matched href and pull the v= value (drop trailing quote)
1940 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1941 if video_id not in already_seen:
1942 video_ids.append(video_id)
1943 already_seen.add(video_id)
1944 if len(video_ids) == n:
1945 # Specified n videos reached
1946 for id in video_ids:
1947 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# no further pages: flush what we collected
1950 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1951 for id in video_ids:
1952 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1955 pagenum = pagenum + 1
# GoogleSearchIE: handles 'gvsearchN:query' / 'gvsearchall:query' pseudo-
# URLs.  Structurally parallel to YoutubeSearchIE: page through Google
# Video search results, collect up to N docids, delegate each to GoogleIE.
# NOTE(review): elided dump -- interior lines (try:, return, loop headers)
# are missing from view; code reproduced byte-for-byte.
1957 class GoogleSearchIE(InfoExtractor):
1958 """Information Extractor for Google Video search queries."""
1959 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1960 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1961 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1962 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1964 _max_google_results = 1000
1966 def __init__(self, google_ie, downloader=None):
1967 InfoExtractor.__init__(self, downloader)
1968 self._google_ie = google_ie
# (suitable-URL predicate; its 'def' line is elided from this view)
1972 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1974 def report_download_page(self, query, pagenum):
1975 """Report attempt to download playlist page with given number."""
1976 query = query.decode(preferredencoding())
1977 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1979 def _real_initialize(self):
1980 self._google_ie.initialize()
1982 def _real_extract(self, query):
1983 mobj = re.match(self._VALID_QUERY, query)
1985 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1988 prefix, query = query.split(':')
1990 query = query.encode('utf-8')
# empty prefix -> default to one result
1992 self._download_n_results(query, 1)
1994 elif prefix == 'all':
1995 self._download_n_results(query, self._max_google_results)
# numeric prefix: validate and clamp to the maximum
2001 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2003 elif n > self._max_google_results:
2004 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2005 n = self._max_google_results
2006 self._download_n_results(query, n)
2008 except ValueError: # parsing prefix as integer fails
2009 self._download_n_results(query, 1)
2012 def _download_n_results(self, query, n):
2013 """Downloads a specified number of results for a query"""
2016 already_seen = set()
# per-page loop (loop header elided from this view)
2020 self.report_download_page(query, pagenum)
2021 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2022 request = urllib2.Request(result_url)
2024 page = urllib2.urlopen(request).read()
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2029 # Extract video identifiers
2030 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# docid captured directly by the indicator regex
2031 video_id = mobj.group(1)
2032 if video_id not in already_seen:
2033 video_ids.append(video_id)
2034 already_seen.add(video_id)
2035 if len(video_ids) == n:
2036 # Specified n videos reached
2037 for id in video_ids:
2038 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# no further pages: flush what we collected
2041 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2042 for id in video_ids:
2043 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2046 pagenum = pagenum + 1
# Information extractor that expands "yvsearchN:terms" pseudo-URLs into
# Yahoo! Video watch URLs; structurally parallel to GoogleSearchIE.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2048 class YahooSearchIE(InfoExtractor):
2049 """Information Extractor for Yahoo! Video search queries."""
# Accepted pseudo-URL scheme: yvsearch<count|all>:<search terms>.
2050 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2051 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2052 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2053 _MORE_PAGES_INDICATOR = r'\s*Next'
# Hard cap on results, enforced in _real_extract below.
2055 _max_yahoo_results = 1000
2057 def __init__(self, yahoo_ie, downloader=None):
2058 InfoExtractor.__init__(self, downloader)
# The YahooIE instance that performs per-video extraction.
2059 self._yahoo_ie = yahoo_ie
2063 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2065 def report_download_page(self, query, pagenum):
2066 """Report attempt to download playlist page with given number."""
2067 query = query.decode(preferredencoding())
2068 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2070 def _real_initialize(self):
2071 self._yahoo_ie.initialize()
2073 def _real_extract(self, query):
2074 mobj = re.match(self._VALID_QUERY, query)
2076 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2079 prefix, query = query.split(':')
2081 query = query.encode('utf-8')
# Empty prefix: download only the first result.
2083 self._download_n_results(query, 1)
2085 elif prefix == 'all':
2086 self._download_n_results(query, self._max_yahoo_results)
2092 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Clamp oversized requests, warning the user.
2094 elif n > self._max_yahoo_results:
2095 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2096 n = self._max_yahoo_results
2097 self._download_n_results(query, n)
2099 except ValueError: # parsing prefix as integer fails
2100 self._download_n_results(query, 1)
2103 def _download_n_results(self, query, n):
2104 """Downloads a specified number of results for a query"""
# De-duplicate video ids across result pages.
2107 already_seen = set()
2111 self.report_download_page(query, pagenum)
2112 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2113 request = urllib2.Request(result_url)
2115 page = urllib2.urlopen(request).read()
2116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2120 # Extract video identifiers
2121 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2122 video_id = mobj.group(1)
2123 if video_id not in already_seen:
2124 video_ids.append(video_id)
2125 already_seen.add(video_id)
# Exactly n ids collected: dispatch them to the per-video extractor.
2126 if len(video_ids) == n:
2127 # Specified n videos reached
2128 for id in video_ids:
2129 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" indicator: results exhausted, flush what was found.
2132 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2133 for id in video_ids:
2134 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2137 pagenum = pagenum + 1
# Information extractor that walks a YouTube playlist/artist/user page list,
# collects the contained video ids, and hands each to the wrapped YoutubeIE.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2139 class YoutubePlaylistIE(InfoExtractor):
2140 """Information Extractor for YouTube playlists."""
# Group 1: list-type discriminator ('p' playlist or 'a' artist);
# group 2: the playlist id; group 3: an optional single video id.
# NOTE(review): the dot in "youtube.com" is unescaped here, so it matches
# any character — presumably benign, but worth confirming upstream.
2142 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2143 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2144 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2145 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2148 def __init__(self, youtube_ie, downloader=None):
2149 InfoExtractor.__init__(self, downloader)
# The YoutubeIE instance that extracts individual videos.
2150 self._youtube_ie = youtube_ie
2154 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2156 def report_download_page(self, playlist_id, pagenum):
2157 """Report attempt to download playlist page with given number."""
2158 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2160 def _real_initialize(self):
2161 self._youtube_ie.initialize()
2163 def _real_extract(self, url):
2164 # Extract playlist id
2165 mobj = re.match(self._VALID_URL, url)
2167 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL named a single video inside the list: extract just that one.
2171 if mobj.group(3) is not None:
2172 self._youtube_ie.extract(mobj.group(3))
2175 # Download playlist pages
2176 # prefix is 'p' as default for playlists but there are other types that need extra care
2177 playlist_prefix = mobj.group(1)
2178 if playlist_prefix == 'a':
2179 playlist_access = 'artist'
2181 playlist_prefix = 'p'
2182 playlist_access = 'view_play_list'
2183 playlist_id = mobj.group(2)
2188 self.report_download_page(playlist_id, pagenum)
2189 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2191 page = urllib2.urlopen(request).read()
2192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2196 # Extract video identifiers
2198 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Preserve first-seen order while de-duplicating within the page.
2199 if mobj.group(1) not in ids_in_page:
2200 ids_in_page.append(mobj.group(1))
2201 video_ids.extend(ids_in_page)
# No "Next" link: last page reached.
2203 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2205 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based option,
# converted to a 0-based slice).
2207 playliststart = self._downloader.params.get('playliststart', 1) - 1
2208 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): playlistend == -1 here drops the final video from the
# slice rather than meaning "to the end" — compare YoutubeUserIE, which
# special-cases -1. Worth confirming whether this is intended.
2209 video_ids = video_ids[playliststart:playlistend]
2211 for id in video_ids:
2212 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Information extractor that lists all uploads of a YouTube user via the
# GData API (paged queries) and hands each video id to the wrapped YoutubeIE.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2215 class YoutubeUserIE(InfoExtractor):
2216 """Information Extractor for YouTube users."""
# Accepts both full profile URLs and the "ytuser:NAME" shorthand.
2218 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2219 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; pagination below works in this step size.
2220 _GDATA_PAGE_SIZE = 50
2221 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2222 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2225 def __init__(self, youtube_ie, downloader=None):
2226 InfoExtractor.__init__(self, downloader)
2227 self._youtube_ie = youtube_ie
2231 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2233 def report_download_page(self, username, start_index):
2234 """Report attempt to download user page."""
2235 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2236 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2238 def _real_initialize(self):
2239 self._youtube_ie.initialize()
2241 def _real_extract(self, url):
2243 mobj = re.match(self._VALID_URL, url)
2245 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2248 username = mobj.group(1)
2250 # Download video ids using YouTube Data API. Result size per
2251 # query is limited (currently to 50 videos) so we need to query
2252 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2259 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2260 self.report_download_page(username, start_index)
2262 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2265 page = urllib2.urlopen(request).read()
2266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2267 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2270 # Extract video identifiers
2273 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Preserve first-seen order while de-duplicating within the page.
2274 if mobj.group(1) not in ids_in_page:
2275 ids_in_page.append(mobj.group(1))
2277 video_ids.extend(ids_in_page)
2279 # A little optimization - if current page is not
2280 # "full", ie. does not contain PAGE_SIZE video ids then
2281 # we can assume that this page is the last one - there
2282 # are no more ids on further pages - no need to query
2285 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2290 all_ids_count = len(video_ids)
# Apply the --playlist-start/--playlist-end window; -1 explicitly means
# "to the end" here (unlike the slice in YoutubePlaylistIE).
2291 playliststart = self._downloader.params.get('playliststart', 1) - 1
2292 playlistend = self._downloader.params.get('playlistend', -1)
2294 if playlistend == -1:
2295 video_ids = video_ids[playliststart:]
2297 video_ids = video_ids[playliststart:playlistend]
2299 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2300 (username, all_ids_count, len(video_ids)))
2302 for video_id in video_ids:
2303 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Information extractor for depositfiles.com file pages: simulates pressing
# the "Free download" button and scrapes the real file URL from the response.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2306 class DepositFilesIE(InfoExtractor):
2307 """Information extractor for depositfiles.com"""
# NOTE(review): the dot in "depositfiles.com" is unescaped; (?#...) is a
# regex comment marking the two-char locale path segment.
2309 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2311 def __init__(self, downloader=None):
2312 InfoExtractor.__init__(self, downloader)
2316 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2318 def report_download_webpage(self, file_id):
2319 """Report webpage download."""
2320 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2322 def report_extraction(self, file_id):
2323 """Report information extraction."""
2324 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2326 def _real_initialize(self):
2329 def _real_extract(self, url):
2330 # At this point we have a new file
2331 self._downloader.increment_downloads()
2333 file_id = url.split('/')[-1]
2334 # Rebuild url in english locale
2335 url = 'http://depositfiles.com/en/files/' + file_id
2337 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the site's free-download form submit.
2338 free_download_indication = { 'gateway_result' : '1' }
2339 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2341 self.report_download_webpage(file_id)
2342 webpage = urllib2.urlopen(request).read()
2343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2347 # Search for the real file URL
2348 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2349 if (mobj is None) or (mobj.group(1) is None):
2350 # Try to figure out reason of the error.
# The site embeds its rate-limit/restriction notice in a <strong> tag.
2351 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2352 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string; as written it relies on
# Python treating the unknown escape literally.
2353 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2354 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2356 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2359 file_url = mobj.group(1)
# Extension without the leading dot.
2360 file_extension = os.path.splitext(file_url)[1][1:]
2362 # Search for file title
2363 mobj = re.search(r'<b title="(.*?)">', webpage)
2365 self._downloader.trouble(u'ERROR: unable to extract title')
2367 file_title = mobj.group(1).decode('utf-8')
2370 # Process file information
2371 self._downloader.process_info({
2372 'id': file_id.decode('utf-8'),
2373 'url': file_url.decode('utf-8'),
2375 'upload_date': u'NA',
2376 'title': file_title,
2377 'stitle': file_title,
2378 'ext': file_extension.decode('utf-8'),
2382 except UnavailableVideoError, err:
2383 self._downloader.trouble(u'ERROR: unable to download file')
# Information extractor for Facebook videos. Logs in (credentials from the
# CLI options or ~/.netrc), downloads the video page, scrapes metadata and
# per-quality URLs from inline JavaScript, then dispatches the download.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2385 class FacebookIE(InfoExtractor):
2386 """Information Extractor for Facebook"""
2388 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2389 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name looked up in ~/.netrc for stored credentials.
2390 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used to pick formats in _real_extract.
2391 _available_formats = ['highqual', 'lowqual']
2392 _video_extensions = {
2397 def __init__(self, downloader=None):
2398 InfoExtractor.__init__(self, downloader)
2402 return (re.match(FacebookIE._VALID_URL, url) is not None)
2404 def _reporter(self, message):
2405 """Add header and report message."""
2406 self._downloader.to_screen(u'[facebook] %s' % message)
2408 def report_login(self):
2409 """Report attempt to log in."""
2410 self._reporter(u'Logging in')
2412 def report_video_webpage_download(self, video_id):
2413 """Report attempt to download video webpage."""
2414 self._reporter(u'%s: Downloading video webpage' % video_id)
2416 def report_information_extraction(self, video_id):
2417 """Report attempt to extract video information."""
2418 self._reporter(u'%s: Extracting video information' % video_id)
2420 def _parse_page(self, video_webpage):
2421 """Extract video information from page"""
# Metadata field name -> regex that captures it from the page HTML.
2423 data = {'title': r'class="video_title datawrap">(.*?)</',
2424 'description': r'<div class="datawrap">(.*?)</div>',
2425 'owner': r'\("video_owner_name", "(.*?)"\)',
2426 'upload_date': r'data-date="(.*?)"',
2427 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Only fields whose pattern matched end up in video_info.
2430 for piece in data.keys():
2431 mobj = re.search(data[piece], video_webpage)
2432 if mobj is not None:
2433 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2437 for fmt in self._available_formats:
2438 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2439 if mobj is not None:
2440 # URL is in a Javascript segment inside an escaped Unicode format within
2441 # the generally utf-8 page
2442 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2443 video_info['video_urls'] = video_urls
2447 def _real_initialize(self):
2448 if self._downloader is None:
2453 downloader_params = self._downloader.params
2455 # Attempt to use provided username and password or .netrc data
2456 if downloader_params.get('username', None) is not None:
2457 useremail = downloader_params['username']
2458 password = downloader_params['password']
2459 elif downloader_params.get('usenetrc', False):
2461 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2462 if info is not None:
2466 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems are non-fatal: warn and carry on without login.
2467 except (IOError, netrc.NetrcParseError), err:
2468 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2471 if useremail is None:
2480 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2483 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means login failed.
2484 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2485 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2487 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2488 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2491 def _real_extract(self, url):
2492 mobj = re.match(self._VALID_URL, url)
2494 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2496 video_id = mobj.group('ID')
2499 self.report_video_webpage_download(video_id)
2500 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2502 page = urllib2.urlopen(request)
2503 video_webpage = page.read()
2504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2505 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2508 # Start extracting information
2509 self.report_information_extraction(video_id)
2511 # Extract information
2512 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; missing either aborts the extraction.
2515 if 'owner' not in video_info:
2516 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2518 video_uploader = video_info['owner']
2521 if 'title' not in video_info:
2522 self._downloader.trouble(u'ERROR: unable to extract video title')
2524 video_title = video_info['title']
2525 video_title = video_title.decode('utf-8')
2526 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of disallowed chars to '_'.
2529 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2530 simple_title = simple_title.strip(ur'_')
# Thumbnail is optional: warn and fall back to the empty string.
2533 if 'thumbnail' not in video_info:
2534 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2535 video_thumbnail = ''
2537 video_thumbnail = video_info['thumbnail']
2541 if 'upload_date' in video_info:
2542 upload_time = video_info['upload_date']
# Parse the RFC-2822-style date from the page into YYYYMMDD.
2543 timetuple = email.utils.parsedate_tz(upload_time)
2544 if timetuple is not None:
2546 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2551 video_description = video_info.get('description', 'No description available.')
2553 url_map = video_info['video_urls']
2554 if len(url_map.keys()) > 0:
2555 # Decide which formats to download
2556 req_format = self._downloader.params.get('format', None)
2557 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality caps the candidate list at the requested quality.
2559 if format_limit is not None and format_limit in self._available_formats:
2560 format_list = self._available_formats[self._available_formats.index(format_limit):]
2562 format_list = self._available_formats
2563 existing_formats = [x for x in format_list if x in url_map]
2564 if len(existing_formats) == 0:
2565 self._downloader.trouble(u'ERROR: no known formats available for video')
2567 if req_format is None:
2568 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2569 elif req_format == '-1':
2570 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2573 if req_format not in url_map:
2574 self._downloader.trouble(u'ERROR: requested format not available')
2576 video_url_list = [(req_format, url_map[req_format])] # Specific format
2578 for format_param, video_real_url in video_url_list:
2580 # At this point we have a new video
2581 self._downloader.increment_downloads()
2584 video_extension = self._video_extensions.get(format_param, 'mp4')
2586 # Find the video URL in fmt_url_map or conn paramters
2588 # Process video information
2589 self._downloader.process_info({
2590 'id': video_id.decode('utf-8'),
2591 'url': video_real_url.decode('utf-8'),
2592 'uploader': video_uploader.decode('utf-8'),
2593 'upload_date': upload_date,
2594 'title': video_title,
2595 'stitle': simple_title,
2596 'ext': video_extension.decode('utf-8'),
2597 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2598 'thumbnail': video_thumbnail.decode('utf-8'),
2599 'description': video_description.decode('utf-8'),
2602 except UnavailableVideoError, err:
2603 self._downloader.trouble(u'\nERROR: unable to download video')
# Information extractor for blip.tv: fetches the JSON metadata variant of a
# page (skin=json) and builds the download info dict from it.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements are elided from this view. Code left byte-identical.
2605 class BlipTVIE(InfoExtractor):
2606 """Information extractor for blip.tv"""
# NOTE(review): the dot in "blip.tv" is unescaped, so it matches any char.
2608 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
# Captures the filename extension from the media URL.
2609 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2613 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2615 def report_extraction(self, file_id):
2616 """Report information extraction."""
2617 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2619 def _simplify_title(self, title):
# Filesystem-safe title: collapse runs of disallowed chars to '_'.
2620 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2621 res = res.strip(ur'_')
2624 def _real_extract(self, url):
2625 mobj = re.match(self._VALID_URL, url)
2627 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask the site for JSON metadata instead of the HTML page.
2630 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2631 request = urllib2.Request(json_url)
2632 self.report_extraction(mobj.group(1))
2634 json_code = urllib2.urlopen(request).read()
2635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2636 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2639 json_data = json.loads(json_code)
# Payload may be wrapped in a 'Post' envelope depending on the endpoint.
2640 data = json_data['Post'] if 'Post' in json_data else json_data
# Site datestamp format 'MM-DD-YY HH:MM(am|pm)' -> YYYYMMDD.
2642 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2643 video_url = data['media']['url']
2644 umobj = re.match(self._URL_EXT, video_url)
2646 raise ValueError('Can not determine filename extension')
2647 ext = umobj.group(1)
2649 self._downloader.increment_downloads()
2652 'id': data['item_id'],
2654 'uploader': data['display_name'],
2655 'upload_date': upload_date,
2656 'title': data['title'],
2657 'stitle': self._simplify_title(data['title']),
2659 'format': data['media']['mimeType'],
2660 'thumbnail': data['thumbnailUrl'],
2661 'description': data['description'],
2662 'player_url': data['embedUrl']
# Any malformed/missing JSON field is reported as a parse failure.
2664 except (ValueError,KeyError), err:
2665 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2669 self._downloader.process_info(info)
2670 except UnavailableVideoError, err:
2671 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for the post-processing chain run by the downloader after each
# successful download; subclasses override run().
2674 class PostProcessor(object):
2675 """Post Processor class.
2677 PostProcessor objects can be added to downloaders with their
2678 add_post_processor() method. When the downloader has finished a
2679 successful download, it will take its internal chain of PostProcessors
2680 and start calling the run() method on each one of them, first with
2681 an initial argument and then with the returned value of the previous
2684 The chain will be stopped if one of them ever returns None or the end
2685 of the chain is reached.
2687 PostProcessor objects follow a "mutual registration" process similar
2688 to InfoExtractor objects.
# Downloader back-reference; may be None until set_downloader() is called.
2693 def __init__(self, downloader=None):
2694 self._downloader = downloader
2696 def set_downloader(self, downloader):
2697 """Sets the downloader for this PP."""
2698 self._downloader = downloader
2700 def run(self, information):
2701 """Run the PostProcessor.
2703 The "information" argument is a dictionary like the ones
2704 composed by InfoExtractors. The only difference is that this
2705 one has an extra field called "filepath" that points to the
2708 When this method returns None, the postprocessing chain is
2709 stopped. However, this method may return an information
2710 dictionary that will be passed to the next postprocessing
2711 object in the chain. It can be the one it received after
2712 changing some fields.
2714 In addition, this method may raise a PostProcessingError
2715 exception that will be taken into account by the downloader
# Default implementation is the identity: pass the dict along unchanged.
2718 return information # by default, do nothing
# Post-processor that converts a downloaded video file to audio-only using
# the external ffprobe (codec detection) and ffmpeg (extraction) binaries.
# NOTE(review): embedded original line numbers are non-contiguous — some
# statements (returns, try:/else: lines) are elided from this view.
2720 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (keep source codec losslessly when aac/mp3),
# 'aac' or 'mp3' (transcode).
2722 def __init__(self, downloader=None, preferredcodec=None):
2723 PostProcessor.__init__(self, downloader)
2724 if preferredcodec is None:
2725 preferredcodec = 'best'
2726 self._preferredcodec = preferredcodec
2729 def get_audio_codec(path):
# ffprobe prints stream descriptions; '--' guards against paths that
# look like options.
2731 cmd = ['ffprobe', '-show_streams', '--', path]
2732 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2733 output = handle.communicate()[0]
2734 if handle.wait() != 0:
2736 except (IOError, OSError):
# Scan for the codec_name of the audio stream: codec_name= lines precede
# the codec_type=audio line of the same stream block.
2739 for line in output.split('\n'):
2740 if line.startswith('codec_name='):
2741 audio_codec = line.split('=')[1].strip()
2742 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2747 def run_ffmpeg(path, out_path, codec, more_opts):
# '-vn' drops the video stream; stderr folded into stdout and discarded.
2749 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2750 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2752 except (IOError, OSError):
2755 def run(self, information):
2756 path = information['filepath']
2758 filecodec = self.get_audio_codec(path)
2759 if filecodec is None:
2760 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2764 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2765 if filecodec == 'aac' or filecodec == 'mp3':
2766 # Lossless if possible
2768 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
2769 if filecodec == 'aac':
2770 more_opts = ['-f', 'adts']
2773 acodec = 'libmp3lame'
2775 more_opts = ['-ab', '128k']
2777 # We convert the audio (lossy)
2778 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2779 extension = self._preferredcodec
2780 more_opts = ['-ab', '128k']
2781 if self._preferredcodec == 'aac':
2782 more_opts += ['-f', 'adts']
# Output path: same basename, audio extension.
2784 (prefix, ext) = os.path.splitext(path)
2785 new_path = prefix + '.' + extension
2786 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2787 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2790 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file is best-effort only.
2795 except (IOError, OSError):
2796 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the new audio file for the rest of the chain.
2799 information['filepath'] = new_path
2802 ### MAIN PROGRAM ###
2803 if __name__ == '__main__':
2805 # Modules needed only when running the main program
2809 # Function to update the program file with the latest version from the repository.
# Self-update: fetch the latest released version number from the repository,
# download that version of the script, and overwrite the running file.
# NOTE(review): embedded original line numbers are non-contiguous — the
# `try:` lines and stream close appear elided from this view.
2810 def update_self(downloader, filename):
2811 # Note: downloader only used for options
# Refuse to proceed if the script file itself is not writable.
2812 if not os.access(filename, os.W_OK):
2813 sys.exit('ERROR: no write permissions on %s' % filename)
2815 downloader.to_screen('Updating to latest stable version...')
# LATEST_VERSION holds the tag name; the script is fetched at that tag.
2817 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2818 latest_version = urllib.urlopen(latest_url).read().strip()
2819 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2820 newcontent = urllib.urlopen(prog_url).read()
2821 except (IOError, OSError), err:
2822 sys.exit('ERROR: unable to download latest version')
2824 stream = open(filename, 'w')
2825 stream.write(newcontent)
2827 except (IOError, OSError), err:
2828 sys.exit('ERROR: unable to overwrite current version')
2829 downloader.to_screen('Updated to version %s' % latest_version)
2831 # Parse command line
2832 parser = optparse.OptionParser(
2833 usage='Usage: %prog [options] url...',
2834 version='2011.03.29',
2835 conflict_handler='resolve',
2838 parser.add_option('-h', '--help',
2839 action='help', help='print this help text and exit')
2840 parser.add_option('-v', '--version',
2841 action='version', help='print program version and exit')
2842 parser.add_option('-U', '--update',
2843 action='store_true', dest='update_self', help='update this program to latest stable version')
2844 parser.add_option('-i', '--ignore-errors',
2845 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2846 parser.add_option('-r', '--rate-limit',
2847 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2848 parser.add_option('-R', '--retries',
2849 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2850 parser.add_option('--playlist-start',
2851 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2852 parser.add_option('--playlist-end',
2853 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2854 parser.add_option('--dump-user-agent',
2855 action='store_true', dest='dump_user_agent',
2856 help='display the current browser identification', default=False)
2858 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2859 authentication.add_option('-u', '--username',
2860 dest='username', metavar='USERNAME', help='account username')
2861 authentication.add_option('-p', '--password',
2862 dest='password', metavar='PASSWORD', help='account password')
2863 authentication.add_option('-n', '--netrc',
2864 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2865 parser.add_option_group(authentication)
2867 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2868 video_format.add_option('-f', '--format',
2869 action='store', dest='format', metavar='FORMAT', help='video format code')
2870 video_format.add_option('--all-formats',
2871 action='store_const', dest='format', help='download all available video formats', const='-1')
2872 video_format.add_option('--max-quality',
2873 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2874 parser.add_option_group(video_format)
2876 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2877 verbosity.add_option('-q', '--quiet',
2878 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2879 verbosity.add_option('-s', '--simulate',
2880 action='store_true', dest='simulate', help='do not download video', default=False)
2881 verbosity.add_option('-g', '--get-url',
2882 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2883 verbosity.add_option('-e', '--get-title',
2884 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2885 verbosity.add_option('--get-thumbnail',
2886 action='store_true', dest='getthumbnail',
2887 help='simulate, quiet but print thumbnail URL', default=False)
2888 verbosity.add_option('--get-description',
2889 action='store_true', dest='getdescription',
2890 help='simulate, quiet but print video description', default=False)
2891 verbosity.add_option('--get-filename',
2892 action='store_true', dest='getfilename',
2893 help='simulate, quiet but print output filename', default=False)
2894 verbosity.add_option('--no-progress',
2895 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2896 verbosity.add_option('--console-title',
2897 action='store_true', dest='consoletitle',
2898 help='display progress in console titlebar', default=False)
2899 parser.add_option_group(verbosity)
2901 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2902 filesystem.add_option('-t', '--title',
2903 action='store_true', dest='usetitle', help='use title in file name', default=False)
2904 filesystem.add_option('-l', '--literal',
2905 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2906 filesystem.add_option('-A', '--auto-number',
2907 action='store_true', dest='autonumber',
2908 help='number downloaded files starting from 00000', default=False)
2909 filesystem.add_option('-o', '--output',
2910 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2911 filesystem.add_option('-a', '--batch-file',
2912 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2913 filesystem.add_option('-w', '--no-overwrites',
2914 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2915 filesystem.add_option('-c', '--continue',
2916 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2917 filesystem.add_option('--cookies',
2918 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2919 filesystem.add_option('--no-part',
2920 action='store_true', dest='nopart', help='do not use .part files', default=False)
2921 filesystem.add_option('--no-mtime',
2922 action='store_false', dest='updatetime',
2923 help='do not use the Last-modified header to set the file modification time', default=True)
2924 filesystem.add_option('--write-description',
2925 action='store_true', dest='writedescription',
2926 help='write video description to a .description file', default=False)
2927 parser.add_option_group(filesystem)
2929 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2930 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2931 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2932 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2933 help='"best", "aac" or "mp3"; best by default')
2934 parser.add_option_group(postproc)
# Parse the command line into option values and positional URL arguments.
2936 (opts, args) = parser.parse_args()
2938 # Open appropriate CookieJar
# No --cookies file: use an in-memory jar; otherwise a Mozilla-format jar
# bound to the given path.
2939 if opts.cookiefile is None:
2940 jar = cookielib.CookieJar()
# NOTE(review): lines are missing from this chunk — an 'else:' branch and a
# 'try:' (presumably around jar.load() for an existing readable cookie
# file) are not visible; confirm against the full file.
2943 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2944 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2946 except (IOError, OSError), err:
2947 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string std_headers sends and (presumably,
# on a line not visible here) exit.
2950 if opts.dump_user_agent:
2951 print std_headers['User-Agent']
2954 # General configuration
# Install a global urllib2 opener that carries the cookie jar, honors
# proxy environment settings, and routes through YoutubeDLHandler.
2955 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2956 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2957 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2959 # Batch file verification
# Collect URLs from --batch-file (if any) ahead of the positional args.
# NOTE(review): the 'try:' line and the '-' (stdin) branch body are not
# visible in this chunk — confirm against the full file.
2961 if opts.batchfile is not None:
2963 if opts.batchfile == '-':
2966 batchfd = open(opts.batchfile, 'r')
2967 batchurls = batchfd.readlines()
2968 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
2969 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2971 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs are processed before command-line URLs.
2972 all_urls = batchurls + args
2974 # Conflicting, missing and erroneous options
# Mutually exclusive / dependent option checks; parser.error() exits.
2975 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2976 parser.error(u'using .netrc conflicts with giving username/password')
2977 if opts.password is not None and opts.username is None:
2978 parser.error(u'account username missing')
2979 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2980 parser.error(u'using output template conflicts with using title, literal title or auto number')
2981 if opts.usetitle and opts.useliteral:
2982 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (never echo).
2983 if opts.username is not None and opts.password is None:
2984 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize --rate-limit from a human string (e.g. '50k') to bytes/sec.
2985 if opts.ratelimit is not None:
2986 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2987 if numeric_limit is None:
2988 parser.error(u'invalid rate limit specified')
2989 opts.ratelimit = numeric_limit
# NOTE(review): the 'try:' lines wrapping the long() conversions below are
# not visible in this chunk — confirm against the full file.
2990 if opts.retries is not None:
2992 opts.retries = long(opts.retries)
2993 except (TypeError, ValueError), err:
2994 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be positive; end is either -1 (no limit) or
# a positive number not smaller than start.
2996 opts.playliststart = long(opts.playliststart)
2997 if opts.playliststart <= 0:
2999 except (TypeError, ValueError), err:
3000 parser.error(u'invalid playlist start number specified')
3002 opts.playlistend = long(opts.playlistend)
3003 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3005 except (TypeError, ValueError), err:
3006 parser.error(u'invalid playlist end number specified')
# --audio-format only accepts the three values advertised in the help text.
3007 if opts.extractaudio:
3008 if opts.audioformat not in ['best', 'aac', 'mp3']:
3009 parser.error(u'invalid audio format specified')
3011 # Information extractors
# Instantiate one extractor per supported site. Several take another
# extractor as a constructor argument (e.g. the playlist/user/search
# extractors delegate individual videos to youtube_ie).
3012 youtube_ie = YoutubeIE()
3013 metacafe_ie = MetacafeIE(youtube_ie)
3014 dailymotion_ie = DailymotionIE()
3015 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3016 youtube_user_ie = YoutubeUserIE(youtube_ie)
3017 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3018 google_ie = GoogleIE()
3019 google_search_ie = GoogleSearchIE(google_ie)
3020 photobucket_ie = PhotobucketIE()
3021 yahoo_ie = YahooIE()
3022 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3023 deposit_files_ie = DepositFilesIE()
3024 facebook_ie = FacebookIE()
3025 bliptv_ie = BlipTVIE()
# GenericIE is the catch-all fallback; it is registered last (below).
3026 generic_ie = GenericIE()
# Build the FileDownloader with a parameter dict derived from the parsed
# options.
# NOTE(review): the closing '})' of this call is on a line not visible in
# this chunk — confirm against the full file.
3029 fd = FileDownloader({
3030 'usenetrc': opts.usenetrc,
3031 'username': opts.username,
3032 'password': opts.password,
# Any of the "get X" query modes implies quiet, simulated operation.
3033 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3034 'forceurl': opts.geturl,
3035 'forcetitle': opts.gettitle,
3036 'forcethumbnail': opts.getthumbnail,
3037 'forcedescription': opts.getdescription,
3038 'forcefilename': opts.getfilename,
3039 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3040 'format': opts.format,
3041 'format_limit': opts.format_limit,
# Output template selection: an explicit -o template wins (decoded from
# the locale's byte encoding to unicode, Python 2); otherwise the chained
# 'or's pick the first template matching the flag combination, falling
# through to the plain '%(id)s.%(ext)s' default.
3042 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3043 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3044 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3045 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3046 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3047 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3048 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3049 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3050 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3051 or u'%(id)s.%(ext)s'),
3052 'ignoreerrors': opts.ignoreerrors,
3053 'ratelimit': opts.ratelimit,
3054 'nooverwrites': opts.nooverwrites,
3055 'retries': opts.retries,
3056 'continuedl': opts.continue_dl,
3057 'noprogress': opts.noprogress,
3058 'playliststart': opts.playliststart,
3059 'playlistend': opts.playlistend,
# '-o -' means "write media to stdout", so progress/log output must go to
# stderr instead.
3060 'logtostderr': opts.outtmpl == '-',
3061 'consoletitle': opts.consoletitle,
3062 'nopart': opts.nopart,
3063 'updatetime': opts.updatetime,
3064 'writedescription': opts.writedescription,
# Register the extractors. Order matters: more specific extractors
# (search/playlist/user) are added before the plain video ones so URLs are
# matched by the most specific handler first.
3066 fd.add_info_extractor(youtube_search_ie)
3067 fd.add_info_extractor(youtube_pl_ie)
3068 fd.add_info_extractor(youtube_user_ie)
3069 fd.add_info_extractor(metacafe_ie)
3070 fd.add_info_extractor(dailymotion_ie)
3071 fd.add_info_extractor(youtube_ie)
3072 fd.add_info_extractor(google_ie)
3073 fd.add_info_extractor(google_search_ie)
3074 fd.add_info_extractor(photobucket_ie)
3075 fd.add_info_extractor(yahoo_ie)
3076 fd.add_info_extractor(yahoo_search_ie)
3077 fd.add_info_extractor(deposit_files_ie)
3078 fd.add_info_extractor(facebook_ie)
3079 fd.add_info_extractor(bliptv_ie)
3081 # This must come last since it's the
3082 # fallback if none of the others work
3083 fd.add_info_extractor(generic_ie)
# Optional post-processor: convert downloads to audio with the format
# validated earlier ('best', 'aac' or 'mp3').
3086 if opts.extractaudio:
3087 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update: replace the running script (sys.argv[0]) with the latest
# version, then fall through.
3090 if opts.update_self:
3091 update_self(fd, sys.argv[0])
# At least one URL is required unless we only came here to self-update.
3094 if len(all_urls) < 1:
3095 if not opts.update_self:
3096 parser.error(u'you must provide at least one URL')
# Run the downloads; retcode is presumably passed to sys.exit() on a line
# not visible in this chunk — confirm against the full file.
3099 retcode = fd.download(all_urls)
3101 # Dump cookie jar if requested
# NOTE(review): the 'try:' and jar.save() lines are not visible here.
3102 if opts.cookiefile is not None:
3105 except (IOError, OSError), err:
3106 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level handlers for the enclosing try (whose 'try:' is outside this
# chunk): DownloadError exits non-verbosely, SameFileError and
# KeyboardInterrupt exit with explicit messages.
3110 except DownloadError:
3112 except SameFileError:
3113 sys.exit(u'ERROR: fixed output name but more than one file to download')
3114 except KeyboardInterrupt:
3115 sys.exit(u'\nERROR: Interrupted by user')