2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
31 # parse_qs was moved from the cgi module to the urlparse module recently.
33 from urlparse import parse_qs
35 from cgi import parse_qs
38 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
39 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
40 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 'Accept-Encoding': 'gzip, deflate',
42 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title: ASCII letters and digits, as
# unicode (the .decode('ascii') calls are Python 2 str -> unicode).
45 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
47 def preferredencoding():
48 """Get preferred encoding.
50 Returns the best encoding scheme for the system, based on
51 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): lines are elided in this listing; the nested generator
# presumably wraps the locale lookup in a try/except fallback -- confirm
# against the full source.
53 def yield_preferredencoding():
55 pref = locale.getpreferredencoding()
# .next() is the Python 2 generator protocol: take the first yielded value.
61 return yield_preferredencoding().next()
63 def htmlentity_transform(matchobj):
64 """Transforms an HTML entity to a Unicode character.
66 This function receives a match object and is intended to be used with
67 the re.sub() function.
69 entity = matchobj.group(1)
71 # Known non-numeric HTML entity
72 if entity in htmlentitydefs.name2codepoint:
73 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric character reference: "#65" (decimal) or "#x41" (hexadecimal).
76 mobj = re.match(ur'(?u)#(x?\d+)', entity)
78 numstr = mobj.group(1)
79 if numstr.startswith(u'x'):
# Prefix "x41" as "0x41" so long(numstr, base) can parse it; the `base`
# assignments are elided in this listing (presumably 16 here, 10
# otherwise -- confirm against full source).
81 numstr = u'0%s' % numstr
84 return unichr(long(numstr, base))
86 # Unknown entity in name, return its literal representation
87 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Make a video title usable as part of a filename.

    HTML entities are decoded into their unicode characters, and any
    occurrence of the OS path separator is replaced so the title cannot
    introduce extra path components.
    """
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(unicode(os.sep), u'%')
94 def sanitize_open(filename, open_mode):
95 """Try to open the given filename, and slightly tweak it if this fails.
97 Attempts to open the given filename. If this fails, it tries to change
98 the filename slightly, step by step, until it's either able to open it
99 or it fails and raises a final exception, like the standard open()
102 It returns the tuple (stream, definitive_file_name).
# NOTE(review): the guard above this branch is elided; it presumably
# tests for the special filename '-' (write to stdout) -- confirm.
# On Windows, stdout must be switched to binary mode or the data
# written through it would be corrupted by newline translation.
106 if sys.platform == 'win32':
108 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
109 return (sys.stdout, filename)
110 stream = open(filename, open_mode)
111 return (stream, filename)
112 except (IOError, OSError), err:
113 # In case of error, try to remove win32 forbidden chars
114 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
116 # An exception here should be caught in the caller
117 stream = open(filename, open_mode)
118 return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects when they are not configured to
    continue on errors; it carries the appropriate error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to indicate an error in
    the postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: number of bytes actually received
        expected:   number of bytes the server announced
    """

    def __init__(self, downloaded, expected):
        # Fix: also initialize the Exception base class so that err.args
        # and str(err) carry useful information (the original left the
        # base uninitialized, making str(err) empty).
        Exception.__init__(self, 'Downloaded %s bytes, expected %s' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
168 class YoutubeDLHandler(urllib2.HTTPHandler):
169 """Handler for HTTP requests and responses.
171 This class, when installed with an OpenerDirector, automatically adds
172 the standard headers to every HTTP request and handles gzipped and
173 deflated responses from web servers. If compression is to be avoided in
174 a particular request, the original request in the program code only has
175 to include the HTTP header "Youtubedl-No-Compression", which will be
176 removed before making the real request.
178 Part of this code was copied from:
180 http://techknack.net/python-urllib2-handlers/
182 Andrew Rowls, the author of that code, agreed to release it to the
# Raw deflate streams have no zlib header; the -MAX_WBITS call handles
# that case (the surrounding def/try lines are elided in this listing).
189 return zlib.decompress(data, -zlib.MAX_WBITS)
191 return zlib.decompress(data)
# Compatibility shim: older urllib2.addinfourl() takes no `code`
# argument, so the status code has to be attached by hand.
194 def addinfourl_wrapper(stream, headers, url, code):
195 if hasattr(urllib2.addinfourl, 'getcode'):
196 return urllib2.addinfourl(stream, headers, url, code)
197 ret = urllib2.addinfourl(stream, headers, url)
# Add the std_headers to the outgoing request (part of the loop body is
# elided here), then honour the internal no-compression marker by
# removing Accept-encoding before the request is sent.
201 def http_request(self, req):
202 for h in std_headers:
205 req.add_header(h, std_headers[h])
206 if 'Youtubedl-no-compression' in req.headers:
207 if 'Accept-encoding' in req.headers:
208 del req.headers['Accept-encoding']
209 del req.headers['Youtubedl-no-compression']
# Transparently decompress gzip/deflate bodies, rebuilding the response
# object so callers read plain data (old_resp assignment elided).
212 def http_response(self, req, resp):
215 if resp.headers.get('Content-encoding', '') == 'gzip':
216 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
217 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
218 resp.msg = old_resp.msg
220 if resp.headers.get('Content-encoding', '') == 'deflate':
221 gz = StringIO.StringIO(self.deflate(resp.read()))
222 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
223 resp.msg = old_resp.msg
226 class FileDownloader(object):
227 """File Downloader class.
229 File downloader objects are the ones responsible of downloading the
230 actual video file and writing it to disk if the user has requested
231 it, among some other tasks. In most cases there should be one per
232 program. As, given a video URL, the downloader doesn't know how to
233 extract all the needed information, task that InfoExtractors do, it
234 has to pass the URL to one of them.
236 For this, file downloader objects have a method that allows
237 InfoExtractors to be registered in a given order. When it is passed
238 a URL, the file downloader handles it to the first InfoExtractor it
239 finds that reports being able to handle it. The InfoExtractor extracts
240 all the information about the video or videos the URL refers to, and
241 asks the FileDownloader to process the video information, possibly
242 downloading the video.
244 File downloaders accept a lot of parameters. In order not to saturate
245 the object constructor with arguments, it receives a dictionary of
246 options instead. These options are available through the params
247 attribute for the InfoExtractors to use. The FileDownloader also
248 registers itself as the downloader in charge for the InfoExtractors
249 that are added to it, so this is a "mutual registration".
253 username: Username for authentication purposes.
254 password: Password for authentication purposes.
255 usenetrc: Use netrc for authentication instead.
256 quiet: Do not print messages to stdout.
257 forceurl: Force printing final URL.
258 forcetitle: Force printing title.
259 forcethumbnail: Force printing thumbnail URL.
260 forcedescription: Force printing description.
261 forcefilename: Force printing final filename.
262 simulate: Do not download the video files.
263 format: Video format code.
264 format_limit: Highest quality format to try.
265 outtmpl: Template for output names.
266 ignoreerrors: Do not stop on download errors.
267 ratelimit: Download speed limit, in bytes/sec.
268 nooverwrites: Prevent overwriting files.
269 retries: Number of times to retry for HTTP error 5xx
270 continuedl: Try to continue downloads if possible.
271 noprogress: Do not print the progress bar.
272 playliststart: Playlist item to start at.
273 playlistend: Playlist item to end at.
274 logtostderr: Log messages to stderr instead of stdout.
275 consoletitle: Display progress in console window's titlebar.
276 nopart: Do not use temporary .part files.
# Class-level placeholders; per-instance values are set in __init__.
282 _download_retcode = None
283 _num_downloads = None
286 def __init__(self, params):
287 """Create a FileDownloader object with the given options."""
290 self._download_retcode = 0
291 self._num_downloads = 0
# Status messages go to stderr when 'logtostderr' is set, keeping
# stdout clean (the bool param indexes the two-element list).
292 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): decorator lines are elided in this listing; the helpers
# below take no self argument, so they are presumably @staticmethod.
296 def pmkdir(filename):
297 """Create directory components in filename. Similar to Unix "mkdir -p"."""
298 components = filename.split(os.sep)
# Build the list of ancestor paths, shortest first, and create each
# one that does not exist yet.
299 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
300 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
301 for dir in aggregate:
302 if not os.path.exists(dir):
# Render a byte count with a one-letter binary-prefix suffix
# (b, k, M, G, ... indexed by the base-1024 exponent).
306 def format_bytes(bytes):
309 if type(bytes) is str:
314 exponent = long(math.log(bytes, 1024.0))
315 suffix = 'bkMGTPEZY'[exponent]
316 converted = float(bytes) / float(1024**exponent)
317 return '%.2f%s' % (converted, suffix)
# Percentage of data_len downloaded, right-aligned in 6 columns.
320 def calc_percent(byte_counter, data_len):
323 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimate remaining time from the average rate observed since `start`.
326 def calc_eta(start, now, total, current):
330 if current == 0 or dif < 0.001: # One millisecond
332 rate = float(current) / dif
333 eta = long((float(total) - float(current)) / rate)
334 (eta_mins, eta_secs) = divmod(eta, 60)
337 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed since `start`, formatted for the progress bar.
340 def calc_speed(start, now, bytes):
342 if bytes == 0 or dif < 0.001: # One millisecond
343 return '%10s' % '---b/s'
344 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adapt the next read size to the observed rate, clamped between half
# the last block and 4 MB.
347 def best_block_size(elapsed_time, bytes):
348 new_min = max(bytes / 2.0, 1.0)
349 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
350 if elapsed_time < 0.001:
352 rate = bytes / elapsed_time
360 def parse_bytes(bytestr):
361 """Parse a string indicating a byte quantity into a long integer."""
362 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
365 number = float(matchobj.group(1))
# An absent suffix matches '' and str.index('') is 0, so the
# multiplier degenerates to 1024**0 == 1.
366 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
367 return long(round(number * multiplier))
369 def add_info_extractor(self, ie):
370 """Add an InfoExtractor object to the end of the list."""
# Mutual registration: the IE also learns about this downloader.
372 ie.set_downloader(self)
374 def add_post_processor(self, pp):
375 """Add a PostProcessor object to the end of the chain."""
377 pp.set_downloader(self)
379 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
380 """Print message to stdout if not in quiet mode."""
382 if not self.params.get('quiet', False):
383 terminator = [u'\n', u''][skip_eol]
# Encode to the locale encoding; the trailing comma suppresses the
# extra newline print would add.
384 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
385 self._screen_file.flush()
386 except (UnicodeEncodeError), err:
387 if not ignore_encoding_errors:
390 def to_stderr(self, message):
391 """Print message to stderr."""
392 print >>sys.stderr, message.encode(preferredencoding())
394 def to_cons_title(self, message):
395 """Set console/terminal window title to message."""
396 if not self.params.get('consoletitle', False):
398 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
399 # c_wchar_p() might not be necessary if `message` is
400 # already of type unicode()
401 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
402 elif 'TERM' in os.environ:
# xterm OSC 0 escape sequence: set icon name and window title.
403 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
405 def fixed_template(self):
406 """Checks if the output template is fixed."""
# "Fixed" means no %(field)s placeholder, i.e. every download would
# reuse exactly the same output filename.
407 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
409 def trouble(self, message=None):
410 """Determine action to take when a download problem appears.
412 Depending on if the downloader has been configured to ignore
413 download errors or not, this method may throw an exception or
414 not when errors are found, after printing the message.
416 if message is not None:
417 self.to_stderr(message)
418 if not self.params.get('ignoreerrors', False):
419 raise DownloadError(message)
# Errors were ignored: remember a non-zero exit code for download().
420 self._download_retcode = 1
422 def slow_down(self, start_time, byte_counter):
423 """Sleep if the download speed is over the rate limit."""
424 rate_limit = self.params.get('ratelimit', None)
425 if rate_limit is None or byte_counter == 0:
428 elapsed = now - start_time
431 speed = float(byte_counter) / elapsed
432 if speed > rate_limit:
# Sleep just long enough that the average speed drops to the limit.
433 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
435 def temp_name(self, filename):
436 """Returns a temporary filename for the given filename."""
# Download straight to the target when .part files are disabled, when
# writing to stdout ('-'), or when the target exists but is not a
# regular file.
437 if self.params.get('nopart', False) or filename == u'-' or \
438 (os.path.exists(filename) and not os.path.isfile(filename)):
440 return filename + u'.part'
# Map a ".part" temporary name back to the final filename.
442 def undo_temp_name(self, filename):
443 if filename.endswith(u'.part'):
444 return filename[:-len(u'.part')]
# Best-effort rename of the temp file onto the final name; failures go
# through trouble() instead of propagating.
447 def try_rename(self, old_filename, new_filename):
449 if old_filename == new_filename:
451 os.rename(old_filename, new_filename)
452 except (IOError, OSError), err:
453 self.trouble(u'ERROR: unable to rename file')
455 def report_destination(self, filename):
456 """Report destination filename."""
457 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
459 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
460 """Report download progress."""
461 if self.params.get('noprogress', False):
# The leading \r rewrites the same console line in place.
463 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
464 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
465 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
466 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
468 def report_resuming_byte(self, resume_len):
469 """Report attempt to resume at given byte."""
470 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
472 def report_retry(self, count, retries):
473 """Report retry in case of HTTP error 5xx"""
474 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
476 def report_file_already_downloaded(self, file_name):
477 """Report file has already been fully downloaded."""
479 self.to_screen(u'[download] %s has already been downloaded' % file_name)
# Fall back to a filename-free message if the name cannot be encoded.
480 except (UnicodeEncodeError), err:
481 self.to_screen(u'[download] The file has already been downloaded')
483 def report_unable_to_resume(self):
484 """Report it was impossible to resume download."""
485 self.to_screen(u'[download] Unable to resume')
487 def report_finish(self):
488 """Report download finished."""
489 if self.params.get('noprogress', False):
490 self.to_screen(u'[download] Download completed')
494 def increment_downloads(self):
495 """Increment the ordinal that assigns a number to each file."""
496 self._num_downloads += 1
498 def prepare_filename(self, info_dict):
499 """Generate the output filename."""
501 template_dict = dict(info_dict)
# Extra template fields: download epoch and a zero-padded ordinal.
502 template_dict['epoch'] = unicode(long(time.time()))
503 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
504 filename = self.params['outtmpl'] % template_dict
506 except (ValueError, KeyError), err:
507 self.trouble(u'ERROR: invalid system charset or erroneous output template')
510 def process_info(self, info_dict):
511 """Process a single dictionary returned by an InfoExtractor."""
512 filename = self.prepare_filename(info_dict)
513 # Do nothing else if in simulate mode
514 if self.params.get('simulate', False):
# Forced printing goes to real stdout so the output can be piped.
516 if self.params.get('forcetitle', False):
517 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
518 if self.params.get('forceurl', False):
519 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
520 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
521 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
522 if self.params.get('forcedescription', False) and 'description' in info_dict:
523 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
524 if self.params.get('forcefilename', False) and filename is not None:
525 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
531 if self.params.get('nooverwrites', False) and os.path.exists(filename):
532 self.to_stderr(u'WARNING: file exists and will be skipped')
536 self.pmkdir(filename)
537 except (OSError, IOError), err:
538 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# The actual transfer; URL is passed as UTF-8 bytes.
542 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
543 except (OSError, IOError), err:
544 raise UnavailableVideoError
545 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
546 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
548 except (ContentTooShortError, ), err:
549 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
554 self.post_process(filename, info_dict)
555 except (PostProcessingError), err:
556 self.trouble(u'ERROR: postprocessing: %s' % str(err))
559 def download(self, url_list):
560 """Download a given list of URLs."""
# A fixed (placeholder-free) template cannot hold more than one file.
561 if len(url_list) > 1 and self.fixed_template():
562 raise SameFileError(self.params['outtmpl'])
# Per-URL dispatch loop (the loop headers are elided in this listing):
# the first InfoExtractor that reports itself suitable handles the URL.
565 suitable_found = False
567 # Go to next InfoExtractor if not suitable
568 if not ie.suitable(url):
571 # Suitable InfoExtractor found
572 suitable_found = True
574 # Extract information from URL and process it
577 # Suitable InfoExtractor had been found; go to next URL
580 if not suitable_found:
581 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
583 return self._download_retcode
585 def post_process(self, filename, ie_info):
586 """Run the postprocessing chain on the given file."""
# `info` is a copy of ie_info (copy line elided) annotated with the
# final file path before being handed to each PostProcessor.
588 info['filepath'] = filename
# Delegate RTMP streams to the external rtmpdump binary, retrying with
# its resume flags until the file stops growing.
594 def _download_with_rtmpdump(self, filename, url, player_url):
595 self.report_destination(filename)
596 tmpfilename = self.temp_name(filename)
598 # Check for rtmpdump first
600 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
601 except (OSError, IOError):
602 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
605 # Download using rtmpdump. rtmpdump returns exit code 2 when
606 # the connection was interrumpted and resuming appears to be
607 # possible. This is part of rtmpdump's normal usage, AFAIK.
608 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
609 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
610 while retval == 2 or retval == 1:
611 prevsize = os.path.getsize(tmpfilename)
612 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
613 time.sleep(5.0) # This seems to be needed
614 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
615 cursize = os.path.getsize(tmpfilename)
# No progress since the last attempt plus a "complete" exit code:
# treat the download as finished.
616 if prevsize == cursize and retval == 1:
619 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
620 self.try_rename(tmpfilename, filename)
623 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Core HTTP download: resume support, 5xx retries, adaptive block size,
# rate limiting, progress reporting and short-content detection.
626 def _do_download(self, filename, url, player_url):
627 # Check file already present
628 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
629 self.report_file_already_downloaded(filename)
632 # Attempt to download using rtmpdump
633 if url.startswith('rtmp'):
634 return self._download_with_rtmpdump(filename, url, player_url)
636 tmpfilename = self.temp_name(filename)
640 # Do not include the Accept-Encoding header
641 headers = {'Youtubedl-no-compression': 'True'}
642 basic_request = urllib2.Request(url, None, headers)
643 request = urllib2.Request(url, None, headers)
645 # Establish possible resume length
646 if os.path.isfile(tmpfilename):
647 resume_len = os.path.getsize(tmpfilename)
651 # Request parameters in case of being able to resume
652 if self.params.get('continuedl', False) and resume_len != 0:
653 self.report_resuming_byte(resume_len)
654 request.add_header('Range','bytes=%d-' % resume_len)
658 retries = self.params.get('retries', 0)
659 while count <= retries:
660 # Establish connection
662 data = urllib2.urlopen(request)
664 except (urllib2.HTTPError, ), err:
665 if (err.code < 500 or err.code >= 600) and err.code != 416:
666 # Unexpected HTTP error
668 elif err.code == 416:
669 # Unable to resume (requested range not satisfiable)
671 # Open the connection again without the range header
672 data = urllib2.urlopen(basic_request)
673 content_length = data.info()['Content-Length']
674 except (urllib2.HTTPError, ), err:
675 if err.code < 500 or err.code >= 600:
678 # Examine the reported length
679 if (content_length is not None and
680 (resume_len - 100 < long(content_length) < resume_len + 100)):
681 # The file had already been fully downloaded.
682 # Explanation to the above condition: in issue #175 it was revealed that
683 # YouTube sometimes adds or removes a few bytes from the end of the file,
684 # changing the file size slightly and causing problems for some users. So
685 # I decided to implement a suggested change and consider the file
686 # completely downloaded if the file size differs less than 100 bytes from
687 # the one in the hard drive.
688 self.report_file_already_downloaded(filename)
689 self.try_rename(tmpfilename, filename)
692 # The length does not match, we start the download over
693 self.report_unable_to_resume()
699 self.report_retry(count, retries)
702 self.trouble(u'ERROR: giving up after %s retries' % retries)
705 data_len = data.info().get('Content-length', None)
706 if data_len is not None:
# The server reports the remaining length only; add what is already
# on disk to get the full expected size.
707 data_len = long(data_len) + resume_len
708 data_len_str = self.format_bytes(data_len)
709 byte_counter = 0 + resume_len
715 data_block = data.read(block_size)
717 if len(data_block) == 0:
719 byte_counter += len(data_block)
721 # Open file just in time
724 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
725 filename = self.undo_temp_name(tmpfilename)
726 self.report_destination(filename)
727 except (OSError, IOError), err:
728 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
731 stream.write(data_block)
732 except (IOError, OSError), err:
733 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
735 block_size = self.best_block_size(after - before, len(data_block))
738 percent_str = self.calc_percent(byte_counter, data_len)
739 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
740 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
741 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
744 self.slow_down(start, byte_counter - resume_len)
# A short read means the transfer was cut off before the announced size.
748 if data_len is not None and byte_counter != data_len:
749 raise ContentTooShortError(byte_counter, long(data_len))
750 self.try_rename(tmpfilename, filename)
753 class InfoExtractor(object):
754 """Information Extractor class.
756 Information extractors are the classes that, given a URL, extract
757 information from the video (or videos) the URL refers to. This
758 information includes the real video URL, the video title and simplified
759 title, author and others. The information is stored in a dictionary
760 which is then passed to the FileDownloader. The FileDownloader
761 processes this information possibly downloading the video to the file
762 system, among other possible outcomes. The dictionaries must include
763 the following fields:
765 id: Video identifier.
766 url: Final video URL.
767 uploader: Nickname of the video uploader.
768 title: Literal title.
769 stitle: Simplified title.
770 ext: Video filename extension.
771 format: Video format.
772 player_url: SWF Player URL (may be None).
774 The following fields are optional. Their primary purpose is to allow
775 youtube-dl to serve as the backend for a video search function, such
776 as the one in youtube2mp3. They are only used when their respective
777 forced printing functions are called:
779 thumbnail: Full URL to a video thumbnail image.
780 description: One-line video description.
782 Subclasses of this one should re-define the _real_initialize() and
783 _real_extract() methods, as well as the suitable() static method.
784 Probably, they should also be instantiated and added to the main
791 def __init__(self, downloader=None):
792 """Constructor. Receives an optional downloader."""
794 self.set_downloader(downloader)
# NOTE(review): the `def suitable(url):` line (and presumably a
# @staticmethod decorator) is elided just above this docstring.
798 """Receives a URL and returns True if suitable for this IE."""
801 def initialize(self):
802 """Initializes an instance (authentication, etc)."""
# The guard above this call is elided; presumably it makes the real
# initialization run only once per instance -- confirm.
804 self._real_initialize()
807 def extract(self, url):
808 """Extracts URL information and returns it in list of dicts."""
# Template method: an initialize() call is elided here, then extraction
# is delegated to the subclass hook.
810 return self._real_extract(url)
812 def set_downloader(self, downloader):
813 """Sets the downloader for this IE."""
814 self._downloader = downloader
816 def _real_initialize(self):
817 """Real initialization process. Redefine in subclasses."""
820 def _real_extract(self, url):
821 """Real extraction process. Redefine in subclasses."""
824 class YoutubeIE(InfoExtractor):
825 """Information extractor for youtube.com."""
827 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
828 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
829 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
830 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
831 _NETRC_MACHINE = 'youtube'
832 # Listed in order of quality
833 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
834 _video_extensions = {
840 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
847 return (re.match(YoutubeIE._VALID_URL, url) is not None)
849 def report_lang(self):
850 """Report attempt to set language."""
851 self._downloader.to_screen(u'[youtube] Setting language')
853 def report_login(self):
854 """Report attempt to log in."""
855 self._downloader.to_screen(u'[youtube] Logging in')
857 def report_age_confirmation(self):
858 """Report attempt to confirm age."""
859 self._downloader.to_screen(u'[youtube] Confirming age')
861 def report_video_webpage_download(self, video_id):
862 """Report attempt to download video webpage."""
863 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
865 def report_video_info_webpage_download(self, video_id):
866 """Report attempt to download video info webpage."""
867 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
869 def report_information_extraction(self, video_id):
870 """Report attempt to extract video information."""
871 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
873 def report_unavailable_format(self, video_id, format):
874 """Report extracted video URL."""
875 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
877 def report_rtmp_download(self):
878 """Indicate the download will use the RTMP protocol."""
879 self._downloader.to_screen(u'[youtube] RTMP download detected')
881 def _real_initialize(self):
882 if self._downloader is None:
887 downloader_params = self._downloader.params
889 # Attempt to use provided username and password or .netrc data
890 if downloader_params.get('username', None) is not None:
891 username = downloader_params['username']
892 password = downloader_params['password']
893 elif downloader_params.get('usenetrc', False):
895 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
900 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
901 except (IOError, netrc.NetrcParseError), err:
902 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
906 request = urllib2.Request(self._LANG_URL)
909 urllib2.urlopen(request).read()
910 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
911 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
914 # No authentication to be performed
920 'current_form': 'loginForm',
922 'action_login': 'Log In',
923 'username': username,
924 'password': password,
926 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
929 login_results = urllib2.urlopen(request).read()
930 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
931 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
934 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
940 'action_confirm': 'Confirm',
942 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
944 self.report_age_confirmation()
945 age_results = urllib2.urlopen(request).read()
946 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
947 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
950 def _real_extract(self, url):
951 # Extract video id from URL
952 mobj = re.match(self._VALID_URL, url)
954 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
956 video_id = mobj.group(2)
959 self.report_video_webpage_download(video_id)
960 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
962 video_webpage = urllib2.urlopen(request).read()
963 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
964 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
967 # Attempt to extract SWF player URL
968 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
970 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
975 self.report_video_info_webpage_download(video_id)
976 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
977 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
978 % (video_id, el_type))
979 request = urllib2.Request(video_info_url)
981 video_info_webpage = urllib2.urlopen(request).read()
982 video_info = parse_qs(video_info_webpage)
983 if 'token' in video_info:
985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
986 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
988 if 'token' not in video_info:
989 if 'reason' in video_info:
990 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
992 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
995 # Start extracting information
996 self.report_information_extraction(video_id)
999 if 'author' not in video_info:
1000 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1002 video_uploader = urllib.unquote_plus(video_info['author'][0])
1005 if 'title' not in video_info:
1006 self._downloader.trouble(u'ERROR: unable to extract video title')
1008 video_title = urllib.unquote_plus(video_info['title'][0])
1009 video_title = video_title.decode('utf-8')
1010 video_title = sanitize_title(video_title)
1013 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1014 simple_title = simple_title.strip(ur'_')
1017 if 'thumbnail_url' not in video_info:
1018 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1019 video_thumbnail = ''
1020 else: # don't panic if we can't find it
1021 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1025 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1026 if mobj is not None:
1027 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1028 format_expressions = ['%d %B %Y', '%B %d %Y']
1029 for expression in format_expressions:
1031 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1036 video_description = 'No description available.'
1037 if self._downloader.params.get('forcedescription', False):
1038 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1039 if mobj is not None:
1040 video_description = mobj.group(1)
1043 video_token = urllib.unquote_plus(video_info['token'][0])
1045 # Decide which formats to download
1046 req_format = self._downloader.params.get('format', None)
1048 if 'fmt_url_map' in video_info:
1049 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1050 format_limit = self._downloader.params.get('format_limit', None)
1051 if format_limit is not None and format_limit in self._available_formats:
1052 format_list = self._available_formats[self._available_formats.index(format_limit):]
1054 format_list = self._available_formats
1055 existing_formats = [x for x in format_list if x in url_map]
1056 if len(existing_formats) == 0:
1057 self._downloader.trouble(u'ERROR: no known formats available for video')
1059 if req_format is None:
1060 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1061 elif req_format == '-1':
1062 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1065 if req_format not in url_map:
1066 self._downloader.trouble(u'ERROR: requested format not available')
1068 video_url_list = [(req_format, url_map[req_format])] # Specific format
1070 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1071 self.report_rtmp_download()
1072 video_url_list = [(None, video_info['conn'][0])]
1075 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1078 for format_param, video_real_url in video_url_list:
1079 # At this point we have a new video
1080 self._downloader.increment_downloads()
1083 video_extension = self._video_extensions.get(format_param, 'flv')
1085 # Find the video URL in fmt_url_map or conn paramters
1087 # Process video information
1088 self._downloader.process_info({
1089 'id': video_id.decode('utf-8'),
1090 'url': video_real_url.decode('utf-8'),
1091 'uploader': video_uploader.decode('utf-8'),
1092 'upload_date': upload_date,
1093 'title': video_title,
1094 'stitle': simple_title,
1095 'ext': video_extension.decode('utf-8'),
1096 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1097 'thumbnail': video_thumbnail.decode('utf-8'),
1098 'description': video_description.decode('utf-8'),
1099 'player_url': player_url,
1101 except UnavailableVideoError, err:
1102 self._downloader.trouble(u'\nERROR: unable to download video')
1105 class MetacafeIE(InfoExtractor):
1106 """Information Extractor for metacafe.com."""
1108 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1109 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1110 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1113 def __init__(self, youtube_ie, downloader=None):
1114 InfoExtractor.__init__(self, downloader)
1115 self._youtube_ie = youtube_ie
1119 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1121 def report_disclaimer(self):
1122 """Report disclaimer retrieval."""
1123 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1125 def report_age_confirmation(self):
1126 """Report attempt to confirm age."""
1127 self._downloader.to_screen(u'[metacafe] Confirming age')
1129 def report_download_webpage(self, video_id):
1130 """Report webpage download."""
1131 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1133 def report_extraction(self, video_id):
1134 """Report information extraction."""
1135 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1137 def _real_initialize(self):
1138 # Retrieve disclaimer
1139 request = urllib2.Request(self._DISCLAIMER)
1141 self.report_disclaimer()
1142 disclaimer = urllib2.urlopen(request).read()
1143 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1144 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1150 'submit': "Continue - I'm over 18",
1152 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1154 self.report_age_confirmation()
1155 disclaimer = urllib2.urlopen(request).read()
1156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1160 def _real_extract(self, url):
1161 # Extract id and simplified title from URL
1162 mobj = re.match(self._VALID_URL, url)
1164 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1167 video_id = mobj.group(1)
1169 # Check if video comes from YouTube
1170 mobj2 = re.match(r'^yt-(.*)$', video_id)
1171 if mobj2 is not None:
1172 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1175 # At this point we have a new video
1176 self._downloader.increment_downloads()
1178 simple_title = mobj.group(2).decode('utf-8')
1180 # Retrieve video webpage to extract further information
1181 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1183 self.report_download_webpage(video_id)
1184 webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1189 # Extract URL, uploader and title from webpage
1190 self.report_extraction(video_id)
1191 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1192 if mobj is not None:
1193 mediaURL = urllib.unquote(mobj.group(1))
1194 video_extension = mediaURL[-3:]
1196 # Extract gdaKey if available
1197 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1199 video_url = mediaURL
1201 gdaKey = mobj.group(1)
1202 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1204 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1206 self._downloader.trouble(u'ERROR: unable to extract media URL')
1208 vardict = parse_qs(mobj.group(1))
1209 if 'mediaData' not in vardict:
1210 self._downloader.trouble(u'ERROR: unable to extract media URL')
1212 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1214 self._downloader.trouble(u'ERROR: unable to extract media URL')
1216 mediaURL = mobj.group(1).replace('\\/', '/')
1217 video_extension = mediaURL[-3:]
1218 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1220 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1222 self._downloader.trouble(u'ERROR: unable to extract title')
1224 video_title = mobj.group(1).decode('utf-8')
1225 video_title = sanitize_title(video_title)
1227 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1229 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1231 video_uploader = mobj.group(1)
1234 # Process video information
1235 self._downloader.process_info({
1236 'id': video_id.decode('utf-8'),
1237 'url': video_url.decode('utf-8'),
1238 'uploader': video_uploader.decode('utf-8'),
1239 'upload_date': u'NA',
1240 'title': video_title,
1241 'stitle': simple_title,
1242 'ext': video_extension.decode('utf-8'),
1246 except UnavailableVideoError:
1247 self._downloader.trouble(u'\nERROR: unable to download video')
1250 class DailymotionIE(InfoExtractor):
1251 """Information Extractor for Dailymotion"""
1253 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1255 def __init__(self, downloader=None):
1256 InfoExtractor.__init__(self, downloader)
1260 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1262 def report_download_webpage(self, video_id):
1263 """Report webpage download."""
1264 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1266 def report_extraction(self, video_id):
1267 """Report information extraction."""
1268 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1270 def _real_initialize(self):
1273 def _real_extract(self, url):
1274 # Extract id and simplified title from URL
1275 mobj = re.match(self._VALID_URL, url)
1277 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1280 # At this point we have a new video
1281 self._downloader.increment_downloads()
1282 video_id = mobj.group(1)
1284 simple_title = mobj.group(2).decode('utf-8')
1285 video_extension = 'flv'
1287 # Retrieve video webpage to extract further information
1288 request = urllib2.Request(url)
1290 self.report_download_webpage(video_id)
1291 webpage = urllib2.urlopen(request).read()
1292 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1293 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1296 # Extract URL, uploader and title from webpage
1297 self.report_extraction(video_id)
1298 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1300 self._downloader.trouble(u'ERROR: unable to extract media URL')
1302 mediaURL = urllib.unquote(mobj.group(1))
1304 # if needed add http://www.dailymotion.com/ if relative URL
1306 video_url = mediaURL
1308 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1309 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1311 self._downloader.trouble(u'ERROR: unable to extract title')
1313 video_title = mobj.group(1).decode('utf-8')
1314 video_title = sanitize_title(video_title)
1316 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1320 video_uploader = mobj.group(1)
1323 # Process video information
1324 self._downloader.process_info({
1325 'id': video_id.decode('utf-8'),
1326 'url': video_url.decode('utf-8'),
1327 'uploader': video_uploader.decode('utf-8'),
1328 'upload_date': u'NA',
1329 'title': video_title,
1330 'stitle': simple_title,
1331 'ext': video_extension.decode('utf-8'),
1335 except UnavailableVideoError:
1336 self._downloader.trouble(u'\nERROR: unable to download video')
1338 class GoogleIE(InfoExtractor):
1339 """Information extractor for video.google.com."""
1341 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1343 def __init__(self, downloader=None):
1344 InfoExtractor.__init__(self, downloader)
1348 return (re.match(GoogleIE._VALID_URL, url) is not None)
1350 def report_download_webpage(self, video_id):
1351 """Report webpage download."""
1352 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1354 def report_extraction(self, video_id):
1355 """Report information extraction."""
1356 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1358 def _real_initialize(self):
1361 def _real_extract(self, url):
1362 # Extract id from URL
1363 mobj = re.match(self._VALID_URL, url)
1365 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1368 # At this point we have a new video
1369 self._downloader.increment_downloads()
1370 video_id = mobj.group(1)
1372 video_extension = 'mp4'
1374 # Retrieve video webpage to extract further information
1375 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1377 self.report_download_webpage(video_id)
1378 webpage = urllib2.urlopen(request).read()
1379 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1380 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1383 # Extract URL, uploader, and title from webpage
1384 self.report_extraction(video_id)
1385 mobj = re.search(r"download_url:'([^']+)'", webpage)
1387 video_extension = 'flv'
1388 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1390 self._downloader.trouble(u'ERROR: unable to extract media URL')
1392 mediaURL = urllib.unquote(mobj.group(1))
1393 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1394 mediaURL = mediaURL.replace('\\x26', '\x26')
1396 video_url = mediaURL
1398 mobj = re.search(r'<title>(.*)</title>', webpage)
1400 self._downloader.trouble(u'ERROR: unable to extract title')
1402 video_title = mobj.group(1).decode('utf-8')
1403 video_title = sanitize_title(video_title)
1404 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1406 # Extract video description
1407 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1409 self._downloader.trouble(u'ERROR: unable to extract video description')
1411 video_description = mobj.group(1).decode('utf-8')
1412 if not video_description:
1413 video_description = 'No description available.'
1415 # Extract video thumbnail
1416 if self._downloader.params.get('forcethumbnail', False):
1417 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1419 webpage = urllib2.urlopen(request).read()
1420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1423 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1425 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1427 video_thumbnail = mobj.group(1)
1428 else: # we need something to pass to process_info
1429 video_thumbnail = ''
1433 # Process video information
1434 self._downloader.process_info({
1435 'id': video_id.decode('utf-8'),
1436 'url': video_url.decode('utf-8'),
1438 'upload_date': u'NA',
1439 'title': video_title,
1440 'stitle': simple_title,
1441 'ext': video_extension.decode('utf-8'),
1445 except UnavailableVideoError:
1446 self._downloader.trouble(u'\nERROR: unable to download video')
1449 class PhotobucketIE(InfoExtractor):
1450 """Information extractor for photobucket.com."""
1452 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1454 def __init__(self, downloader=None):
1455 InfoExtractor.__init__(self, downloader)
1459 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1461 def report_download_webpage(self, video_id):
1462 """Report webpage download."""
1463 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1465 def report_extraction(self, video_id):
1466 """Report information extraction."""
1467 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1469 def _real_initialize(self):
1472 def _real_extract(self, url):
1473 # Extract id from URL
1474 mobj = re.match(self._VALID_URL, url)
1476 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1479 # At this point we have a new video
1480 self._downloader.increment_downloads()
1481 video_id = mobj.group(1)
1483 video_extension = 'flv'
1485 # Retrieve video webpage to extract further information
1486 request = urllib2.Request(url)
1488 self.report_download_webpage(video_id)
1489 webpage = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1494 # Extract URL, uploader, and title from webpage
1495 self.report_extraction(video_id)
1496 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1498 self._downloader.trouble(u'ERROR: unable to extract media URL')
1500 mediaURL = urllib.unquote(mobj.group(1))
1502 video_url = mediaURL
1504 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1506 self._downloader.trouble(u'ERROR: unable to extract title')
1508 video_title = mobj.group(1).decode('utf-8')
1509 video_title = sanitize_title(video_title)
1510 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1512 video_uploader = mobj.group(2).decode('utf-8')
1515 # Process video information
1516 self._downloader.process_info({
1517 'id': video_id.decode('utf-8'),
1518 'url': video_url.decode('utf-8'),
1519 'uploader': video_uploader,
1520 'upload_date': u'NA',
1521 'title': video_title,
1522 'stitle': simple_title,
1523 'ext': video_extension.decode('utf-8'),
1527 except UnavailableVideoError:
1528 self._downloader.trouble(u'\nERROR: unable to download video')
1531 class YahooIE(InfoExtractor):
1532 """Information extractor for video.yahoo.com."""
1534 # _VALID_URL matches all Yahoo! Video URLs
1535 # _VPAGE_URL matches only the extractable '/watch/' URLs
1536 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1537 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1539 def __init__(self, downloader=None):
1540 InfoExtractor.__init__(self, downloader)
1544 return (re.match(YahooIE._VALID_URL, url) is not None)
1546 def report_download_webpage(self, video_id):
1547 """Report webpage download."""
1548 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1550 def report_extraction(self, video_id):
1551 """Report information extraction."""
1552 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1554 def _real_initialize(self):
1557 def _real_extract(self, url, new_video=True):
1558 # Extract ID from URL
1559 mobj = re.match(self._VALID_URL, url)
1561 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1564 # At this point we have a new video
1565 self._downloader.increment_downloads()
1566 video_id = mobj.group(2)
1567 video_extension = 'flv'
1569 # Rewrite valid but non-extractable URLs as
1570 # extractable English language /watch/ URLs
1571 if re.match(self._VPAGE_URL, url) is None:
1572 request = urllib2.Request(url)
1574 webpage = urllib2.urlopen(request).read()
1575 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1576 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1579 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1581 self._downloader.trouble(u'ERROR: Unable to extract id field')
1583 yahoo_id = mobj.group(1)
1585 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1587 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1589 yahoo_vid = mobj.group(1)
1591 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1592 return self._real_extract(url, new_video=False)
1594 # Retrieve video webpage to extract further information
1595 request = urllib2.Request(url)
1597 self.report_download_webpage(video_id)
1598 webpage = urllib2.urlopen(request).read()
1599 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1600 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1603 # Extract uploader and title from webpage
1604 self.report_extraction(video_id)
1605 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1607 self._downloader.trouble(u'ERROR: unable to extract video title')
1609 video_title = mobj.group(1).decode('utf-8')
1610 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1612 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1614 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1616 video_uploader = mobj.group(1).decode('utf-8')
1618 # Extract video thumbnail
1619 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1621 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1623 video_thumbnail = mobj.group(1).decode('utf-8')
1625 # Extract video description
1626 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1628 self._downloader.trouble(u'ERROR: unable to extract video description')
1630 video_description = mobj.group(1).decode('utf-8')
1631 if not video_description: video_description = 'No description available.'
1633 # Extract video height and width
1634 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1636 self._downloader.trouble(u'ERROR: unable to extract video height')
1638 yv_video_height = mobj.group(1)
1640 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract video width')
1644 yv_video_width = mobj.group(1)
1646 # Retrieve video playlist to extract media URL
1647 # I'm not completely sure what all these options are, but we
1648 # seem to need most of them, otherwise the server sends a 401.
1649 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1650 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1651 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1652 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1653 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1655 self.report_download_webpage(video_id)
1656 webpage = urllib2.urlopen(request).read()
1657 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1658 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1661 # Extract media URL from playlist XML
1662 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1664 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1666 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1667 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1670 # Process video information
1671 self._downloader.process_info({
1672 'id': video_id.decode('utf-8'),
1674 'uploader': video_uploader,
1675 'upload_date': u'NA',
1676 'title': video_title,
1677 'stitle': simple_title,
1678 'ext': video_extension.decode('utf-8'),
1679 'thumbnail': video_thumbnail.decode('utf-8'),
1680 'description': video_description,
1681 'thumbnail': video_thumbnail,
1682 'description': video_description,
1685 except UnavailableVideoError:
1686 self._downloader.trouble(u'\nERROR: unable to download video')
1689 class GenericIE(InfoExtractor):
1690 """Generic last-resort information extractor."""
1692 def __init__(self, downloader=None):
1693 InfoExtractor.__init__(self, downloader)
1699 def report_download_webpage(self, video_id):
1700 """Report webpage download."""
1701 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1702 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1704 def report_extraction(self, video_id):
1705 """Report information extraction."""
1706 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1708 def _real_initialize(self):
1711 def _real_extract(self, url):
1712 # At this point we have a new video
1713 self._downloader.increment_downloads()
1715 video_id = url.split('/')[-1]
1716 request = urllib2.Request(url)
1718 self.report_download_webpage(video_id)
1719 webpage = urllib2.urlopen(request).read()
1720 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1721 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1723 except ValueError, err:
1724 # since this is the last-resort InfoExtractor, if
1725 # this error is thrown, it'll be thrown here
1726 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1729 self.report_extraction(video_id)
1730 # Start with something easy: JW Player in SWFObject
1731 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1733 # Broaden the search a little bit
1734 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1736 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1739 # It's possible that one of the regexes
1740 # matched, but returned an empty group:
1741 if mobj.group(1) is None:
1742 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1745 video_url = urllib.unquote(mobj.group(1))
1746 video_id = os.path.basename(video_url)
1748 # here's a fun little line of code for you:
1749 video_extension = os.path.splitext(video_id)[1][1:]
1750 video_id = os.path.splitext(video_id)[0]
1752 # it's tempting to parse this further, but you would
1753 # have to take into account all the variations like
1754 # Video Title - Site Name
1755 # Site Name | Video Title
1756 # Video Title - Tagline | Site Name
1757 # and so on and so forth; it's just not practical
1758 mobj = re.search(r'<title>(.*)</title>', webpage)
1760 self._downloader.trouble(u'ERROR: unable to extract title')
1762 video_title = mobj.group(1).decode('utf-8')
1763 video_title = sanitize_title(video_title)
1764 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1766 # video uploader is domain name
1767 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1769 self._downloader.trouble(u'ERROR: unable to extract title')
1771 video_uploader = mobj.group(1).decode('utf-8')
1774 # Process video information
1775 self._downloader.process_info({
1776 'id': video_id.decode('utf-8'),
1777 'url': video_url.decode('utf-8'),
1778 'uploader': video_uploader,
1779 'upload_date': u'NA',
1780 'title': video_title,
1781 'stitle': simple_title,
1782 'ext': video_extension.decode('utf-8'),
1786 except UnavailableVideoError, err:
1787 self._downloader.trouble(u'\nERROR: unable to download video')
1790 class YoutubeSearchIE(InfoExtractor):
1791 """Information Extractor for YouTube search queries."""
1792 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1793 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1794 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1795 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1797 _max_youtube_results = 1000
1799 def __init__(self, youtube_ie, downloader=None):
1800 InfoExtractor.__init__(self, downloader)
1801 self._youtube_ie = youtube_ie
1805 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1807 def report_download_page(self, query, pagenum):
1808 """Report attempt to download playlist page with given number."""
1809 query = query.decode(preferredencoding())
1810 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1812 def _real_initialize(self):
1813 self._youtube_ie.initialize()
1815 def _real_extract(self, query):
1816 mobj = re.match(self._VALID_QUERY, query)
1818 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1821 prefix, query = query.split(':')
1823 query = query.encode('utf-8')
1825 self._download_n_results(query, 1)
1827 elif prefix == 'all':
1828 self._download_n_results(query, self._max_youtube_results)
1834 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1836 elif n > self._max_youtube_results:
1837 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1838 n = self._max_youtube_results
1839 self._download_n_results(query, n)
1841 except ValueError: # parsing prefix as integer fails
1842 self._download_n_results(query, 1)
1845 def _download_n_results(self, query, n):
1846 """Downloads a specified number of results for a query"""
1849 already_seen = set()
1853 self.report_download_page(query, pagenum)
1854 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1855 request = urllib2.Request(result_url)
1857 page = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1862 # Extract video identifiers
1863 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1864 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1865 if video_id not in already_seen:
1866 video_ids.append(video_id)
1867 already_seen.add(video_id)
1868 if len(video_ids) == n:
1869 # Specified n videos reached
1870 for id in video_ids:
1871 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1874 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1875 for id in video_ids:
1876 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1879 pagenum = pagenum + 1
1881 class GoogleSearchIE(InfoExtractor):
1882 """Information Extractor for Google Video search queries."""
1883 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1884 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1885 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1886 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1888 _max_google_results = 1000
1890 def __init__(self, google_ie, downloader=None):
1891 InfoExtractor.__init__(self, downloader)
1892 self._google_ie = google_ie
1896 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1898 def report_download_page(self, query, pagenum):
1899 """Report attempt to download playlist page with given number."""
1900 query = query.decode(preferredencoding())
1901 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1903 def _real_initialize(self):
1904 self._google_ie.initialize()
1906 def _real_extract(self, query):
1907 mobj = re.match(self._VALID_QUERY, query)
1909 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1912 prefix, query = query.split(':')
1914 query = query.encode('utf-8')
1916 self._download_n_results(query, 1)
1918 elif prefix == 'all':
1919 self._download_n_results(query, self._max_google_results)
1925 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1927 elif n > self._max_google_results:
1928 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1929 n = self._max_google_results
1930 self._download_n_results(query, n)
1932 except ValueError: # parsing prefix as integer fails
1933 self._download_n_results(query, 1)
1936 def _download_n_results(self, query, n):
1937 """Downloads a specified number of results for a query"""
1940 already_seen = set()
1944 self.report_download_page(query, pagenum)
1945 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1946 request = urllib2.Request(result_url)
1948 page = urllib2.urlopen(request).read()
1949 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1950 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1953 # Extract video identifiers
1954 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1955 video_id = mobj.group(1)
1956 if video_id not in already_seen:
1957 video_ids.append(video_id)
1958 already_seen.add(video_id)
1959 if len(video_ids) == n:
1960 # Specified n videos reached
1961 for id in video_ids:
1962 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1965 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1966 for id in video_ids:
1967 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1970 pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Query syntax: "yvsearch:<terms>", "yvsearchN:<terms>" or "yvsearchall:<terms>".
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    # %s placeholders: url-quoted query terms, result offset.
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures the "<uploader-id>/<video-id>" pair from result page links.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'  # presence means another results page exists
    _max_yahoo_results = 1000  # cap on how many results we will ever request

    def __init__(self, yahoo_ie, downloader=None):
        # Keep a reference to the plain YahooIE that does the per-video work.
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    # NOTE(review): the enclosing "def suitable(url):" header is not visible in
    # this excerpt; the line below is its body.
    return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())  # bytes -> unicode for display
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        # Delegate session/setup work to the wrapped YahooIE.
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        # Split the query into its "yvsearchN"/"yvsearchall" prefix and terms.
        mobj = re.match(self._VALID_QUERY, query)
        # (guard line "if mobj is None:" not visible in this excerpt)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare "yvsearch:" prefix: fetch only the first result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Otherwise the prefix is a result count; parse it as an integer.
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                # Clamp oversized requests to the service maximum.
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()  # dedupe video ids across result pages

        # (page loop header not visible in this excerpt)
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            # ("try:" opening this block not visible in this excerpt)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            # No "Next" link: this was the last page, extract everything collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): the dot in "youtube.com" is unescaped, so it matches any
    # character; harmless in practice but "youtube\.com" is the intended form.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
    # %s placeholders: playlist id, page number.
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'  # captures each video id from playlist links
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'  # presence means another playlist page

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie  # delegate that extracts the individual videos

    # NOTE(review): the enclosing "def suitable(url):" header is not visible in
    # this excerpt; the line below is its body.
    return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # (guard line "if mobj is None:" not visible in this excerpt)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)

        # (page loop header not visible in this excerpt)
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
            # ("try:" opening this block not visible in this excerpt)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))  # dedupe while preserving order
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # (loop-exit statement not visible in this excerpt)

            pagenum = pagenum + 1

        # Honour --playlist-start/--playlist-end (1-based on the command line).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice drops the
        # final video id — looks like an off-by-one; confirm intended semantics.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): unescaped dot in "youtube.com" (matches any character).
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'  # GData user feed
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie  # delegate that extracts the individual videos

    # NOTE(review): the enclosing "def suitable(url):" header is not visible in
    # this excerpt; the line below is its body.
    return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (guard line "if mobj is None:" not visible in this excerpt)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username))
        # ("try:" opening this block not visible in this excerpt)
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): no pagination is visible here — presumably only the first
        # GData feed page is fetched, truncating prolific users; confirm upstream.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))  # dedupe while preserving order
        video_ids.extend(ids_in_page)

        # Honour --playlist-start/--playlist-end (1-based on the command line).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice drops the
        # final video id — looks like an off-by-one; confirm intended semantics.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # "(?#...)" is an inline regex comment; "../" optionally skips a two-letter
    # locale path segment.  NOTE(review): dots in "depositfiles.com" are unescaped.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing "def suitable(url):" header is not visible in
    # this excerpt; the line below is its body.
    return (re.match(DepositFilesIE._VALID_URL, url) is not None)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # (method body not visible in this excerpt)

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing this form field simulates pressing the button.
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # ("try:" opening this block not visible in this excerpt)
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse runs of whitespace in the site's error blurb.
                # NOTE(review): '\s+' should be a raw string (r'\s+'); behaviour
                # is unchanged today but it is an invalid-escape-in-waiting.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]  # extension without the dot

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (guard line "if mobj is None:" not visible in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances register with a downloader via its add_post_processor()
    method.  After each successful download the downloader walks its chain
    of PostProcessors, calling run() on each: the first call receives the
    freshly built information dictionary and every later call receives
    whatever the previous processor returned.

    The chain stops as soon as a processor returns None or when no
    processors remain.  The registration scheme mirrors the "mutual
    registration" used by InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        # Route through the setter so both entry points share one assignment.
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach *downloader* as the FileDownloader this PP reports to."""
        self._downloader = downloader

    def run(self, information):
        """Process *information* and return it for the next PP in the chain.

        The dictionary has the same layout as the ones composed by
        InfoExtractors, with one extra field, "filepath", naming the file
        that was downloaded.  Returning None halts the postprocessing
        chain; returning a (possibly modified) dictionary passes it along.
        A PostProcessingError may also be raised for the downloader to
        handle.  The default implementation passes the data through
        untouched.
        """
        return information
### MAIN PROGRAM ###
if __name__ == '__main__':
    # Modules needed only when running the main program
    # Function to update the program file with the latest version from the repository.
    def update_self(downloader, filename):
        # Note: downloader only used for options
        if not os.access(filename, os.W_OK):
            sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest stable version...')

        # Fetch the latest release tag, then the script for that tag.
        # ("try:" opening this block not visible in this excerpt)
            latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
        except (IOError, OSError), err:
            sys.exit('ERROR: unable to download latest version')

        # Overwrite this very script in place with the downloaded version.
        # ("try:" opening this block not visible in this excerpt)
            stream = open(filename, 'w')
            stream.write(newcontent)
        except (IOError, OSError), err:
            sys.exit('ERROR: unable to overwrite current version')

        downloader.to_screen('Updated to version %s' % latest_version)

    # Parse command line
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] url...',
        version='2010.12.09',
        conflict_handler='resolve',

    parser.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
            action='version', help='print program version and exit')
    parser.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest stable version')
    parser.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    parser.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    parser.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    parser.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)

    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option_group(authentication)

    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='-1')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    parser.add_option_group(video_format)

    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
    parser.add_option_group(verbosity)

    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    parser.add_option_group(filesystem)

    (opts, args) = parser.parse_args()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    # ("else:"/"try:" lines not visible in this excerpt)
        jar = cookielib.MozillaCookieJar(opts.cookiefile)
        # Only load the file if it already exists and is readable.
        if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

    if opts.dump_user_agent:
        print std_headers['User-Agent']

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Batch file verification
    if opts.batchfile is not None:
        if opts.batchfile == '-':
            batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            # Skip blank lines and comment lines starting with '#', '/' or ';'.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Username given without a password: prompt interactively.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        # ("try:" opening this block not visible in this excerpt)
            opts.retries = long(opts.retries)
        except (TypeError, ValueError), err:
            parser.error(u'invalid retry count specified')
        opts.playliststart = long(opts.playliststart)
        if opts.playliststart <= 0:
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
        opts.playlistend = long(opts.playlistend)
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    dailymotion_ie = DailymotionIE()
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_user_ie = YoutubeUserIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)
    google_ie = GoogleIE()
    google_search_ie = GoogleSearchIE(google_ie)
    photobucket_ie = PhotobucketIE()
    yahoo_ie = YahooIE()
    yahoo_search_ie = YahooSearchIE(yahoo_ie)
    deposit_files_ie = DepositFilesIE()
    generic_ie = GenericIE()

    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # Any of the "get-*" options implies quiet, simulate-only operation.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
        'format': opts.format,
        'format_limit': opts.format_limit,
        # First truthy template wins: explicit -o, then the --all-formats
        # variants, then title/literal/autonumber combinations, then plain id.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',  # "-o -" writes media to stdout
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
    # Registration order matters: more specific extractors come first.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(youtube_user_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(dailymotion_ie)
    fd.add_info_extractor(youtube_ie)
    fd.add_info_extractor(google_ie)
    fd.add_info_extractor(google_search_ie)
    fd.add_info_extractor(photobucket_ie)
    fd.add_info_extractor(yahoo_ie)
    fd.add_info_extractor(yahoo_search_ie)
    fd.add_info_extractor(deposit_files_ie)

    # This must come last since it's the
    # fallback if none of the others work
    fd.add_info_extractor(generic_ie)

    if opts.update_self:
        update_self(fd, sys.argv[0])

    # (a "try:" guarding the lines below is implied by the except clauses at
    # the end but is not visible in this excerpt)
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')

    retcode = fd.download(all_urls)

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')

    except DownloadError:
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')