2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
27 # parse_qs was moved from the cgi module to the urlparse module recently.
29 from urlparse import parse_qs
31 from cgi import parse_qs
# NOTE(excerpt): the opening "std_headers = {" line and the closing brace are
# not visible in this excerpt; these are the entries of the module-level
# default HTTP headers sent with every request.
'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',

# Characters considered safe in simplified titles: ASCII letters and digits,
# decoded to unicode objects (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(excerpt): the try/except fallback around this call and the
        # loop that repeatedly yields the value are elided in this excerpt.
        pref = locale.getpreferredencoding()
    # Python 2 generator protocol: .next() pulls the first yielded value.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal, or hexadecimal with an 'x' prefix.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(excerpt): the None-check on mobj and the line selecting the
    # numeric `base` (10 vs 16) are elided in this excerpt.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Turn 'x1F' into '0x1F' so long(numstr, 16) accepts it.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
85 def sanitize_title(utitle):
86 """Sanitizes a video title so it could be used as part of a filename."""
87 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
88 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(excerpt): the enclosing try: and a check for filename == u'-'
    # (stdout) appear to be elided in this excerpt.
    if sys.platform == 'win32':
        # Put stdout into binary mode so video data is not mangled on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: number of bytes actually received.
        expected:   number of bytes the server announced.
    """

    def __init__(self, downloaded, expected):
        # Initialize the base Exception with a message so str(err) and
        # err.args are meaningful (the original left the base class
        # uninitialized, yielding an empty str()).
        Exception.__init__(self, 'Content too short: %s bytes downloaded, %s expected' % (downloaded, expected))
        # Keep both counts so callers can report the mismatch precisely.
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:        Username for authentication purposes.
    password:        Password for authentication purposes.
    usenetrc:        Use netrc for authentication instead.
    quiet:           Do not print messages to stdout.
    forceurl:        Force printing final URL.
    forcetitle:      Force printing title.
    forcethumbnail:  Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:        Do not download the video files.
    format:          Video format code.
    format_limit:    Highest quality format to try.
    outtmpl:         Template for output names.
    ignoreerrors:    Do not stop on download errors.
    ratelimit:       Download speed limit, in bytes/sec.
    nooverwrites:    Prevent overwriting files.
    retries:         Number of times to retry for HTTP error 5xx
    continuedl:      Try to continue downloads if possible.
    noprogress:      Do not print the progress bar.
    playliststart:   Playlist item to start at.
    playlistend:     Playlist item to end at.
    logtostderr:     Log messages to stderr instead of stdout.
    consoletitle:    Display progress in console window's titlebar.
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(excerpt): lines initializing the InfoExtractor/PostProcessor
        # lists and storing `params` on self appear to be elided here.
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen output to stdout or stderr depending on the option.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    # NOTE(excerpt): the staticmethod marker for the utility methods below
    # is not visible in this excerpt — presumably they are staticmethods,
    # since they take no self; verify against the full file.
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Progressively longer path prefixes: a, a/b, a/b/c, ...
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(excerpt): the os.mkdir(dir) call is elided here.

    def temp_name(filename):
        """Returns a temporary filename for the given filename."""
        # Never mangle stdout ('-') or non-regular files (e.g. named pipes).
        if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
            # NOTE(excerpt): the early "return filename" branch is elided.
        return filename + u'.part'

    def format_bytes(bytes):
        # Human-readable byte count, e.g. '1.25M'.
        if type(bytes) is str:
            # NOTE(excerpt): conversion of str input and the small-value
            # special cases are elided in this excerpt.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Right-aligned percentage string, e.g. ' 42.0%'.
        # NOTE(excerpt): the data_len is None fallback is elided.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # NOTE(excerpt): `dif = now - start` and the '--:--' fallback
        # returns are elided in this excerpt.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # NOTE(excerpt): the overflow guard (eta > 99 minutes) is elided.
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # NOTE(excerpt): `dif = now - start` is elided in this excerpt.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size to the observed rate of the last read.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # NOTE(excerpt): the fast-path return and the final clamped
            # return based on `rate` are elided in this excerpt.
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(excerpt): the None-check on matchobj is elided.
        number = float(matchobj.group(1))
        # Empty suffix maps to index 0 via the leading 'b' => multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE gets a back-reference to us.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(excerpt): the try: wrapping the block below is elided.
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(excerpt): the re-raise is elided in this excerpt.

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # NOTE(excerpt): the early return is elided.
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC escape sequence that sets the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means no %(field)s placeholders occur in the template.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors were ignored; remember that something went wrong.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(excerpt): the early return is elided; `now = time.time()`
            # is also missing before the elapsed computation below.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to fall back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def try_rename(self, old_filename, new_filename):
        # Move the temporary (.part) file onto its final name.
        if old_filename == new_filename:
            # NOTE(excerpt): the early return and the enclosing try: are elided.
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(excerpt): the early return is elided.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(excerpt): the try: wrapping the next line is elided.
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message that cannot fail to encode.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # NOTE(excerpt): the else branch (terminating the progress-bar line)
        # is elided in this excerpt.

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings requested on the command line.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(excerpt): the return ending simulate mode is elided.

        # Build the output filename from the template.
        # NOTE(excerpt): the try: wrapping the block below is elided.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            # NOTE(excerpt): the return after trouble() is elided.
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(excerpt): the return is elided.

        # NOTE(excerpt): the try: wrapping pmkdir is elided.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        # NOTE(excerpt): the try: wrapping the download call is elided.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # NOTE(excerpt): the success guard and the try: around
        # post-processing are elided.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A template with no placeholders can only produce one filename.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(excerpt): the loops over url_list and the registered
        # InfoExtractors are elided; the statements below belong to the
        # body that runs per (url, InfoExtractor) pair.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # NOTE(excerpt): the continue is elided.

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it
        # NOTE(excerpt): the ie.extract(url) call is elided.

        # Suitable InfoExtractor had been found; go to next URL
        # NOTE(excerpt): the break is elided.

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(excerpt): the copy of ie_info into `info` and the loop over
        # the registered PostProcessors are elided in this excerpt.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url, player_url):
        # Download an rtmp:// URL by shelling out to the rtmpdump tool.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(excerpt): the try: is elided.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # NOTE(excerpt): the failure return is elided.

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(excerpt): the break (no progress -> give up) is elided.
        # NOTE(excerpt): the success check on retval is elided; the lines
        # below are the success and failure tails respectively.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename):
            self.report_file_already_downloaded(filename)
            # NOTE(excerpt): the early success return is elided.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        # NOTE(excerpt): stream/open_mode initialization is elided.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(excerpt): the else branch (resume_len = 0) is elided.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            # NOTE(excerpt): switching open_mode to append is elided.

        # NOTE(excerpt): count initialization is elided.
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # NOTE(excerpt): the try: and the break on success are elided.
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # NOTE(excerpt): the re-raise is elided.
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # NOTE(excerpt): the inner try: is elided.
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # NOTE(excerpt): the re-raise is elided.
                    # Examine the reported length
                    if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # NOTE(excerpt): the success return is elided.
                    # NOTE(excerpt): the else: restarting the download
                    # from scratch is elided around the lines below.
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
            # NOTE(excerpt): the retry counter increment is elided.
            self.report_retry(count, retries)

        # NOTE(excerpt): the count > retries guard is elided.
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Account for the part of the file already on disk.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # NOTE(excerpt): block_size/start initialization and the main
        # "while True:" read loop header are elided; the statements below
        # form that loop's body.
        data_block = data.read(block_size)
        if len(data_block) == 0:
            # NOTE(excerpt): the break on EOF is elided.
        byte_counter += len(data_block)

        # Open file just in time
        # NOTE(excerpt): the "stream is None" guard and try: are elided.
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))

        # NOTE(excerpt): the try: around the write is elided.
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the measured throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress message
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

        # NOTE(excerpt): stream.close() and report_finish() are elided.
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
        # NOTE(excerpt): the final success return is elided.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(excerpt): initialization of the readiness flag appears to
        # be elided here.
        self.set_downloader(downloader)

    # NOTE(excerpt): the "def suitable(url):" header (and its staticmethod
    # marker) is elided; only its docstring is visible below.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(excerpt): the guard that makes initialization run only once
        # appears to be elided around this call.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(excerpt): the self.initialize() call is elided before this.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(excerpt): the pass is elided.

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(excerpt): the pass is elided.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the URL prefix, group 2 the 11-char-style video id;
    # the (?(1).+)? conditional only allows trailing junk when a prefix
    # matched (i.e. a bare id must match exactly).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(excerpt): most itag->extension entries are elided here.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(excerpt): the dict close and the "def suitable(url):" header
    # (staticmethod) are elided; the return below belongs to suitable().
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        # Set language and, when credentials are available, log in and
        # confirm age so restricted videos become reachable.
        if self._downloader is None:
            # NOTE(excerpt): the early return is elided.

        # NOTE(excerpt): username/password defaults are elided.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(excerpt): the try: is elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(excerpt): the branch unpacking `info` is elided; the
            # raise below fires when no authenticators were found.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(excerpt): the return is elided.

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # NOTE(excerpt): the try: and report_lang() call are elided.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(excerpt): the return is elided.

        # No authentication to be performed
        # NOTE(excerpt): the username-is-None early return is elided.

        # Log in
        # NOTE(excerpt): the "login_form = {" opener and some hidden form
        # fields are elided around the entries below.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # NOTE(excerpt): the try: and report_login() call are elided.
        login_results = urllib2.urlopen(request).read()
        # The login form being present again means authentication failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # NOTE(excerpt): the return is elided.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(excerpt): the return is elided.

        # Confirm age
        # NOTE(excerpt): the "age_form = {" opener is elided around the
        # entry below.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        # NOTE(excerpt): the try: is elided.
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # NOTE(excerpt): the return is elided.

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(excerpt): the "if mobj is None:" guard is elided around the
        # trouble() call below.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
        # NOTE(excerpt): the try: is elided.
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # NOTE(excerpt): the return is elided.

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(excerpt): the mobj-is-not-None guard (and the else assigning
        # player_url = None) is elided; re.sub unescapes the JS backslashes.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
            # NOTE(excerpt): the try: is elided.
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(excerpt): the break is elided.
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # NOTE(excerpt): the return is elided.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(excerpt): the else: is elided before the line below.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            # NOTE(excerpt): the return is elided.

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # NOTE(excerpt): the return is elided.
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Literal title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # NOTE(excerpt): the return is elided.
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse runs of non-safe characters to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail URL.
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(excerpt): the fallback assignment is elided.
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, scraped from the watch page.
        # NOTE(excerpt): the default assignment and the mobj guard are
        # elided around the lines below.
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y']
        for expression in format_expressions:
            # NOTE(excerpt): the try/except ValueError around the parse is
            # elided.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description, only fetched when forced printing requests it.
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            # NOTE(excerpt): the mobj guard is elided.
            video_description = mobj.group(1)

        # Session token.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is a comma-separated list of "itag|url" pairs.
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(excerpt): the else: is elided before the line below.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(excerpt): the return is elided.
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(excerpt): the else: (specific format requested) is
            # elided around the lines below.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    # NOTE(excerpt): the return is elided.
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        # NOTE(excerpt): the else: is elided before the lines below.
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            # NOTE(excerpt): the return is elided.

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension, defaulting to flv for unknown itags.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters
            # NOTE(excerpt): the try: is elided.
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            # NOTE(excerpt): the call close "})" is elided.
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Visits the family-filter disclaimer page and confirms age before
	extracting, and delegates "yt-" prefixed Metacafe ids to the YouTube
	extractor passed to the constructor.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Page that must be visited before age-restricted content is served.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	# Endpoint the family-filter form is POSTed to.
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Used to delegate "yt-" prefixed ids to the YouTube extractor.
		self._youtube_ie = youtube_ie

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		# (gap in excerpt: "try:" header missing)
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# (gap in excerpt: opening of the disclaimer_form dict missing)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		# (gap in excerpt: "try:" header missing)
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Wrapped YouTube video: hand it to the YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# File extension is taken from the last three URL characters.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				video_url = mediaURL
			# (gap in excerpt: "else:" header missing)
				gdaKey = mobj.group(1)
				# The key is appended to the media URL as an access token.
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		# (gap in excerpt: "else:" branch header missing; fallback path parses flashvars)
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: unable to extract media URL')

			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')

			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: unable to extract media URL')

			# Undo JSON escaping of slashes before using the URL.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

		video_uploader = mobj.group(1)

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (gap in excerpt: method body missing; presumably a no-op — verify)

	def _real_extract(self, url):
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract media URL')

		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

		video_uploader = mobj.group(1)

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (gap in excerpt: method body missing; presumably a no-op — verify)

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Default to mp4; downgraded to flv on the fallback path below.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		# (gap in excerpt: "if mobj is None:" header missing — this is the flash fallback)
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: unable to extract media URL')

		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the page's JS hex escaping of '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of non-filename-safe characters into underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video description')

		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail is scraped from a search-results page for this id.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			# (gap in excerpt: "try:" header missing)
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				# (gap in excerpt: 'uploader' entry missing)
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	# Only .flv media referenced via the "current" query parameter match.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (gap in excerpt: method body missing; presumably a no-op — verify)

	def _real_extract(self, url):
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract media URL')

		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# The <title> carries both the video title (group 1) and uploader (group 2).
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of non-filename-safe characters into underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader,
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com.

	Non-'/watch/' URLs are first rewritten to the canonical /watch/ form
	and re-extracted recursively (new_video=False on the recursive call).

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(YahooIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (gap in excerpt: method body missing; presumably a no-op — verify)

	def _real_extract(self, url, new_video=True):
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = urllib2.Request(url)
			# (gap in excerpt: "try:" header missing)
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: Unable to extract id field')
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: Unable to extract vid field')
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		# Collapse runs of non-filename-safe characters into underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
		# NOTE(review): group(1) here is the "(people|profile)" alternation; the
		# display name is captured by group(2) — this looks wrong, verify.
		video_uploader = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description: video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video height')
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract video width')
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
		yv_bitrate = '700' # according to Wikipedia this is hard-coded
		request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: Unable to extract media URL')
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		# Decode any HTML entities embedded in the stream URL.
		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				# (gap in excerpt: 'url' entry missing)
				'uploader': video_uploader,
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description,
				# NOTE(review): duplicate dict keys — 'thumbnail' and 'description'
				# are repeated below; in a dict literal the later entries win, so
				# the decoded thumbnail above is silently discarded.
				'thumbnail': video_thumbnail,
				'description': video_description,
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	Tried when no site-specific extractor claims the URL; scrapes a JW
	Player / SWFObject style "file=" media URL out of the page.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns);
	they are flagged, not reconstructed.
	"""

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# (gap in excerpt: the "suitable" method is missing entirely)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# Warn loudly: generic extraction is best-effort.
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# (gap in excerpt: method body missing; presumably a no-op — verify)

	def _real_extract(self, url):
		# At this point we have a new video
		self._downloader.increment_downloads()

		# Provisional id from the URL's last path component.
		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		# (gap in excerpt: "try:" header missing)
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		# (gap in excerpt: "return" missing)
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		# (gap in excerpt: "if mobj is None:" header missing)
			# Broaden the search a little bit
			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
			# (gap in excerpt: "if mobj is None:" guard missing)
				self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_url = urllib.unquote(mobj.group(1))
		# Re-derive the id from the media URL's basename.
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of non-filename-safe characters into underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: unable to extract title')

		video_uploader = mobj.group(1).decode('utf-8')

		# (gap in excerpt: "try:" header missing)
			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_url.decode('utf-8'),
				'uploader': video_uploader,
				'upload_date': u'NA',
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
			# (gap in excerpt: remaining entries and call close missing)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles "ytsearch:", "ytsearchN:" and "ytsearchall:" pseudo-URLs and
	delegates each found video to the YouTube extractor.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns,
	loop/initialiser lines); they are flagged, not reconstructed.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on results; "ytsearchall" is clamped to this.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_QUERY, query)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		# (gap in excerpt: "if prefix == '':" branch header missing — bare "ytsearch:" downloads one result)
			query = query.encode('utf-8')
			self._download_n_results(query, 1)
		# (gap in excerpt: "return" missing)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
		# (gap in excerpt: "return" and the numeric-prefix branch header ("else:" / "try:" / n parse / "if n <= 0:") missing)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			# (gap in excerpt: "return" missing)
			elif n > self._max_youtube_results:
				self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
				n = self._max_youtube_results
				self._download_n_results(query, n)
			# (gap in excerpt: "return" missing)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
			# (gap in excerpt: "return" missing)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# (gap in excerpt: video_ids/pagenum initialisation missing)
		already_seen = set()
		# (gap in excerpt: "while True:" loop header missing)
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			# (gap in excerpt: "try:" header missing)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# Slice the matched href text, split on '=', strip the closing quote.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						# (gap in excerpt: "return" missing)

			# No "Next" link: last results page — extract what we collected.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				# (gap in excerpt: "return" missing)

			pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles "gvsearch:", "gvsearchN:" and "gvsearchall:" pseudo-URLs and
	delegates each found video to the Google Video extractor.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns,
	loop/initialiser lines); they are flagged, not reconstructed.
	"""
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	# Hard cap on results; "gvsearchall" is clamped to this.
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_QUERY, query)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		# (gap in excerpt: "if prefix == '':" branch header missing — bare "gvsearch:" downloads one result)
			query = query.encode('utf-8')
			self._download_n_results(query, 1)
		# (gap in excerpt: "return" missing)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
		# (gap in excerpt: "return" and the numeric-prefix branch header ("else:" / "try:" / n parse / "if n <= 0:") missing)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			# (gap in excerpt: "return" missing)
			elif n > self._max_google_results:
				self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
				n = self._max_google_results
				self._download_n_results(query, n)
			# (gap in excerpt: "return" missing)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
			# (gap in excerpt: "return" missing)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# (gap in excerpt: video_ids/pagenum initialisation missing)
		already_seen = set()
		# (gap in excerpt: "while True:" loop header missing)
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			# (gap in excerpt: "try:" header missing)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
						# (gap in excerpt: "return" missing)

			# No "Next" link: last results page — extract what we collected.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
				# (gap in excerpt: "return" missing)

			pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Handles "yvsearch:", "yvsearchN:" and "yvsearchall:" pseudo-URLs and
	delegates each found video to the Yahoo! Video extractor.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, returns,
	loop/initialiser lines); they are flagged, not reconstructed.
	"""
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# Hard cap on results; "yvsearchall" is clamped to this.
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_QUERY, query)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		# (gap in excerpt: "if prefix == '':" branch header missing — bare "yvsearch:" downloads one result)
			query = query.encode('utf-8')
			self._download_n_results(query, 1)
		# (gap in excerpt: "return" missing)
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
		# (gap in excerpt: "return" and the numeric-prefix branch header ("else:" / "try:" / n parse / "if n <= 0:") missing)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			# (gap in excerpt: "return" missing)
			elif n > self._max_yahoo_results:
				self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
				n = self._max_yahoo_results
				self._download_n_results(query, n)
			# (gap in excerpt: "return" missing)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
			# (gap in excerpt: "return" missing)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# (gap in excerpt: video_ids/pagenum initialisation missing)
		already_seen = set()
		# (gap in excerpt: "while True:" loop header missing)
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			# (gap in excerpt: "try:" header missing)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
						# (gap in excerpt: "return" missing)

			# No "Next" link: last results page — extract what we collected.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				# (gap in excerpt: "return" missing)

			pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks all pages of a playlist, collects video ids in order, applies
	the playliststart/playlistend window, then delegates each video to
	the YouTube extractor.

	NOTE(review): comments of the form "(gap in excerpt: ...)" mark source
	lines missing from this excerpt (guards, "try:" headers, loop and
	initialiser lines); they are flagged, not reconstructed.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	# (gap in excerpt: "def suitable(self, url):" header missing before this line)
		return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		# (gap in excerpt: "if mobj is None:" guard missing)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Download playlist pages
		playlist_id = mobj.group(1)
		# (gap in excerpt: video_ids/pagenum initialisation and "while True:" header missing)
			self.report_download_page(playlist_id, pagenum)
			request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
			# (gap in excerpt: "try:" header missing)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			# (gap in excerpt: "ids_in_page = []" initialisation missing)
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# De-duplicate within the page while preserving order.
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# No "Next" link means this was the last playlist page.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			# (gap in excerpt: "break" missing)
			pagenum = pagenum + 1

		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		# NOTE(review): with the default playlistend of -1 this slice stops one
		# element short, excluding the playlist's final video — looks like an
		# off-by-one; verify (a sentinel of None would take the full tail).
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2045 class YoutubeUserIE(InfoExtractor):
2046 """Information Extractor for YouTube users."""
2048 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2049 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2050 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2053 def __init__(self, youtube_ie, downloader=None):
2054 InfoExtractor.__init__(self, downloader)
2055 self._youtube_ie = youtube_ie
2059 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2061 def report_download_page(self, username):
2062 """Report attempt to download user page."""
2063 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2065 def _real_initialize(self):
2066 self._youtube_ie.initialize()
2068 def _real_extract(self, url):
2070 mobj = re.match(self._VALID_URL, url)
2072 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2075 # Download user page
2076 username = mobj.group(1)
2080 self.report_download_page(username)
2081 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2083 page = urllib2.urlopen(request).read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2088 # Extract video identifiers
2091 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2092 if mobj.group(1) not in ids_in_page:
2093 ids_in_page.append(mobj.group(1))
2094 video_ids.extend(ids_in_page)
2096 playliststart = self._downloader.params.get('playliststart', 1) - 1
2097 playlistend = self._downloader.params.get('playlistend', -1)
2098 video_ids = video_ids[playliststart:playlistend]
2100 for id in video_ids:
2101 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2104 class DepositFilesIE(InfoExtractor):
2105 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment: the optional '../' path segment is the
# two-letter locale prefix in depositfiles URLs.
# NOTE(review): the unescaped '.' in 'depositfiles.com' also matches any
# character -- consider r'depositfiles\.com'.
2107 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2109 def __init__(self, downloader=None):
2110 InfoExtractor.__init__(self, downloader)
2114 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2116 def report_download_webpage(self, file_id):
2117 """Report webpage download."""
2118 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2120 def report_extraction(self, file_id):
2121 """Report information extraction."""
2122 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2124 def _real_initialize(self):
# Workflow: rebuild the URL in the English locale, POST the 'Free
# download' form, scrape the real file URL and title from the response,
# then hand everything to the downloader.
# NOTE(review): several original lines (`try:` openers, `return`s, part
# of the process_info dict) are not visible in this excerpt.
2127 def _real_extract(self, url):
2128 # At this point we have a new file
2129 self._downloader.increment_downloads()
2131 file_id = url.split('/')[-1]
2132 # Rebuild url in english locale
2133 url = 'http://depositfiles.com/en/files/' + file_id
2135 # Retrieve file webpage with 'Free download' button pressed
2136 free_download_indication = { 'gateway_result' : '1' }
# Supplying POST data makes urllib2 issue a POST request.
2137 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2139 self.report_download_webpage(file_id)
2140 webpage = urllib2.urlopen(request).read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2145 # Search for the real file URL
2146 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2147 if (mobj is None) or (mobj.group(1) is None):
2148 # Try to figure out reason of the error.
2149 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2150 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace so the site's multi-line notice fits one line.
2151 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2152 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2154 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2157 file_url = mobj.group(1)
2158 file_extension = os.path.splitext(file_url)[1][1:]  # extension without the leading dot
2160 # Search for file title
2161 mobj = re.search(r'<b title="(.*?)">', webpage)
2163 self._downloader.trouble(u'ERROR: unable to extract title')
2165 file_title = mobj.group(1).decode('utf-8')
2168 # Process file information
2169 self._downloader.process_info({
2170 'id': file_id.decode('utf-8'),
2171 'url': file_url.decode('utf-8'),
2173 'upload_date': u'NA',
2174 'title': file_title,
2175 'stitle': file_title,
2176 'ext': file_extension.decode('utf-8'),
2180 except UnavailableVideoError, err:
2181 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Base class for post-processing steps.

	A PostProcessor is registered on a downloader through its
	add_post_processor() method -- the same "mutual registration"
	scheme that InfoExtractor objects use.  After each successful
	download, the downloader invokes run() on every registered
	processor in order, feeding each call the dictionary returned by
	the previous one.  A None return value stops the chain; reaching
	the last processor ends it normally.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		`information` is an InfoExtractor-style dictionary carrying one
		extra key, "filepath", which names the downloaded file.

		Return None to stop the postprocessing chain, or an information
		dictionary (possibly this one, with some fields changed) to
		pass along to the next processor.  Implementations may raise
		PostProcessingError, which the downloader handles.
		"""
		return information  # base class: pass the data through untouched
2229 ### MAIN PROGRAM ###
# The whole command-line program runs inside this guard (and, in the full
# file, inside a try block whose handlers appear at the very bottom).
2230 if __name__ == '__main__':
2232 # Modules needed only when running the main program
# Comment corrected: the updater below fetches from github.com, not bitbucket.
2236 # Function to update the program file with the latest version from github.com
def update_self(downloader, filename):
	"""Overwrite this program's file with the latest released version.

	`downloader` is used only for screen output; `filename` is the path
	of the currently running script (normally sys.argv[0]).  Exits the
	process if the file is not writable.
	"""
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest stable version...')
	latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
	latest_version = urllib.urlopen(latest_url).read().strip()
	prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
	newcontent = urllib.urlopen(prog_url).read()
	# BUG FIX: write in binary mode so the downloaded bytes are written
	# verbatim (text mode would translate newlines on Windows), and make
	# sure the handle is closed even if the write fails part-way.
	stream = open(filename, 'wb')
	try:
		stream.write(newcontent)
	finally:
		stream.close()
	downloader.to_screen('Updated to version %s' % latest_version)
# NOTE(review): this excerpt omits some original lines (`try:` openers,
# `else:` branches, continue/exit statements); the comments added below
# describe only the visible code.
2252 # Parse command line
2253 parser = optparse.OptionParser(
2254 usage='Usage: %prog [options] url...',
2255 version='2010.12.09',
2256 conflict_handler='resolve',
2259 parser.add_option('-h', '--help',
2260 action='help', help='print this help text and exit')
2261 parser.add_option('-v', '--version',
2262 action='version', help='print program version and exit')
2263 parser.add_option('-U', '--update',
2264 action='store_true', dest='update_self', help='update this program to latest stable version')
2265 parser.add_option('-i', '--ignore-errors',
2266 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2267 parser.add_option('-r', '--rate-limit',
2268 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2269 parser.add_option('-R', '--retries',
2270 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2271 parser.add_option('--playlist-start',
2272 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2273 parser.add_option('--playlist-end',
2274 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2275 parser.add_option('--dump-user-agent',
2276 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2278 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2279 authentication.add_option('-u', '--username',
2280 dest='username', metavar='USERNAME', help='account username')
2281 authentication.add_option('-p', '--password',
2282 dest='password', metavar='PASSWORD', help='account password')
2283 authentication.add_option('-n', '--netrc',
2284 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2285 parser.add_option_group(authentication)
2287 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2288 video_format.add_option('-f', '--format',
2289 action='store', dest='format', metavar='FORMAT', help='video format code')
2290 video_format.add_option('--all-formats',
2291 action='store_const', dest='format', help='download all available video formats', const='-1')
2292 video_format.add_option('--max-quality',
2293 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2294 parser.add_option_group(video_format)
2296 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2297 verbosity.add_option('-q', '--quiet',
2298 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2299 verbosity.add_option('-s', '--simulate',
2300 action='store_true', dest='simulate', help='do not download video', default=False)
2301 verbosity.add_option('-g', '--get-url',
2302 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2303 verbosity.add_option('-e', '--get-title',
2304 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2305 verbosity.add_option('--get-thumbnail',
2306 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2307 verbosity.add_option('--get-description',
2308 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2309 verbosity.add_option('--no-progress',
2310 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2311 verbosity.add_option('--console-title',
2312 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2313 parser.add_option_group(verbosity)
2315 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2316 filesystem.add_option('-t', '--title',
2317 action='store_true', dest='usetitle', help='use title in file name', default=False)
2318 filesystem.add_option('-l', '--literal',
2319 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2320 filesystem.add_option('-A', '--auto-number',
2321 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2322 filesystem.add_option('-o', '--output',
2323 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2324 filesystem.add_option('-a', '--batch-file',
2325 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2326 filesystem.add_option('-w', '--no-overwrites',
2327 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2328 filesystem.add_option('-c', '--continue',
2329 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2330 filesystem.add_option('--cookies',
2331 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2332 parser.add_option_group(filesystem)
# All options defined; parse argv (optparse aborts on unknown options).
2334 (opts, args) = parser.parse_args()
2336 # Open appropriate CookieJar
2337 if opts.cookiefile is None:
2338 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and pre-load it if the file
# already exists and is readable.
2341 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2342 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2344 except (IOError, OSError), err:
2345 sys.exit(u'ERROR: unable to open cookie file')
2348 if opts.dump_user_agent:
2349 print std_headers['User-Agent']
2352 # General configuration
2353 cookie_processor = urllib2.HTTPCookieProcessor(jar)
# NOTE(review): the second install_opener() call below replaces the
# ProxyHandler-based opener installed on the previous line -- confirm
# that losing the explicit proxy handler is intended.
2354 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2355 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2356 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2358 # Batch file verification
2360 if opts.batchfile is not None:
2362 if opts.batchfile == '-':
2365 batchfd = open(opts.batchfile, 'r')
2366 batchurls = batchfd.readlines()
2367 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with '#', '/' or ';' (comments).
2368 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2370 sys.exit(u'ERROR: batch file could not be read')
2371 all_urls = batchurls + args
2373 # Conflicting, missing and erroneous options
2374 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2375 parser.error(u'using .netrc conflicts with giving username/password')
2376 if opts.password is not None and opts.username is None:
2377 parser.error(u'account username missing')
2378 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2379 parser.error(u'using output template conflicts with using title, literal title or auto number')
2380 if opts.usetitle and opts.useliteral:
2381 parser.error(u'using title conflicts with using literal title')
2382 if opts.username is not None and opts.password is None:
2383 opts.password = getpass.getpass(u'Type account password and press return:')
# Numeric option post-processing: optparse delivers these as strings.
2384 if opts.ratelimit is not None:
2385 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2386 if numeric_limit is None:
2387 parser.error(u'invalid rate limit specified')
2388 opts.ratelimit = numeric_limit
2389 if opts.retries is not None:
2391 opts.retries = long(opts.retries)
2392 except (TypeError, ValueError), err:
2393 parser.error(u'invalid retry count specified')
2395 opts.playliststart = long(opts.playliststart)
2396 if opts.playliststart <= 0:
2398 except (TypeError, ValueError), err:
2399 parser.error(u'invalid playlist start number specified')
2401 opts.playlistend = long(opts.playlistend)
# -1 is the "to the end of the playlist" sentinel, hence exempt here.
2402 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2404 except (TypeError, ValueError), err:
2405 parser.error(u'invalid playlist end number specified')
2407 # Information extractors
2408 youtube_ie = YoutubeIE()
2409 metacafe_ie = MetacafeIE(youtube_ie)
2410 dailymotion_ie = DailymotionIE()
2411 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2412 youtube_user_ie = YoutubeUserIE(youtube_ie)
2413 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2414 google_ie = GoogleIE()
2415 google_search_ie = GoogleSearchIE(google_ie)
2416 photobucket_ie = PhotobucketIE()
2417 yahoo_ie = YahooIE()
2418 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2419 deposit_files_ie = DepositFilesIE()
2420 generic_ie = GenericIE()
# Build the FileDownloader with the effective option set.
2423 fd = FileDownloader({
2424 'usenetrc': opts.usenetrc,
2425 'username': opts.username,
2426 'password': opts.password,
2427 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2428 'forceurl': opts.geturl,
2429 'forcetitle': opts.gettitle,
2430 'forcethumbnail': opts.getthumbnail,
2431 'forcedescription': opts.getdescription,
2432 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2433 'format': opts.format,
2434 'format_limit': opts.format_limit,
# Output template: first truthy branch of this or-chain wins; an
# explicit -o template overrides every derived default.
2435 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2436 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2437 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2438 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2439 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2440 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2441 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2442 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2443 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2444 or u'%(id)s.%(ext)s'),
2445 'ignoreerrors': opts.ignoreerrors,
2446 'ratelimit': opts.ratelimit,
2447 'nooverwrites': opts.nooverwrites,
2448 'retries': opts.retries,
2449 'continuedl': opts.continue_dl,
2450 'noprogress': opts.noprogress,
2451 'playliststart': opts.playliststart,
2452 'playlistend': opts.playlistend,
# Writing the video to stdout (-o -) means logs must go to stderr.
2453 'logtostderr': opts.outtmpl == '-',
2454 'consoletitle': opts.consoletitle,
# Registration order matters: more specific extractors come before the
# plain YoutubeIE so search/playlist/user URLs are claimed first.
2456 fd.add_info_extractor(youtube_search_ie)
2457 fd.add_info_extractor(youtube_pl_ie)
2458 fd.add_info_extractor(youtube_user_ie)
2459 fd.add_info_extractor(metacafe_ie)
2460 fd.add_info_extractor(dailymotion_ie)
2461 fd.add_info_extractor(youtube_ie)
2462 fd.add_info_extractor(google_ie)
2463 fd.add_info_extractor(google_search_ie)
2464 fd.add_info_extractor(photobucket_ie)
2465 fd.add_info_extractor(yahoo_ie)
2466 fd.add_info_extractor(yahoo_search_ie)
2467 fd.add_info_extractor(deposit_files_ie)
2469 # This must come last since it's the
2470 # fallback if none of the others work
2471 fd.add_info_extractor(generic_ie)
# Self-update requested via -U / --update.
2474 if opts.update_self:
2475 update_self(fd, sys.argv[0])
2478 if len(all_urls) < 1:
2479 if not opts.update_self:
2480 parser.error(u'you must provide at least one URL')
2483 retcode = fd.download(all_urls)
2485 # Dump cookie jar if requested
2486 if opts.cookiefile is not None:
2489 except (IOError, OSError), err:
2490 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling for the whole run (the matching try: opener
# sits near the top of the main program, outside this excerpt).
2494 except DownloadError:
2496 except SameFileError:
2497 sys.exit(u'ERROR: fixed output name but more than one file to download')
2498 except KeyboardInterrupt:
2499 sys.exit(u'\nERROR: Interrupted by user')